def _recode_types(project_id, data_params, module_params):
    '''
    Runs the recoding module
    
    ARGUMENTS (GET):
        project_id: ID for "normalize" project

    ARGUMENTS (POST):
        - data_params: {
                "module_name": module to fetch from
                "file_name": file to fetch
                }
        - module_params: same as result of infer_mvs
    '''
    proj = ESNormalizer(project_id=project_id)

    proj.load_data(data_params['module_name'], data_params['file_name'])

    _, run_info = proj.transform('recode_types', module_params)

    # Write transformations and logs
    proj.write_data()

    return run_info
def _run_all_transforms(project_id, data_params, *argv):
    '''
    Run all transformations that have already been run (as indicated by the
    presence of run_info.json files), reusing the parameters stored in those
    run_info.json files.

    ARGUMENTS (GET):
        project_id: ID for "normalize" project

    ARGUMENTS (POST):
        - data_params: {
                "file_name": file to use for the transforms (module_name is 'INIT')
                }
    '''
    proj = ESNormalizer(project_id=project_id)

    file_name = data_params['file_name']

    proj.load_data('INIT', file_name)
    all_run_infos = proj.run_all_transforms()

    # Write transformations and logs
    proj.write_data()
    return all_run_infos
def _infer_types(project_id, data_params, module_params):
    '''
    Runs the infer_types module
    
    Wrapper around ESNormalizer.infer.
    
    ARGUMENTS (GET):
        project_id: ID for "normalize" project

    ARGUMENTS (POST):
        - data_params: {
                "module_name": module to fetch from
                "file_name": file to fetch
                }
        - module_params: none
    
    '''
    proj = ESNormalizer(project_id=project_id)
    proj.load_data(data_params['module_name'], data_params['file_name'])
    result = proj.infer('infer_types', module_params)

    # Write log
    proj._write_log_buffer(False)
    return result
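
The wrapper functions above share the same calling convention: a project id plus a
data_params dict naming the module and file to load. Below is a minimal, hypothetical
sketch of how _infer_types might be invoked; the project id, module name and file name
are placeholders, not values taken from a real project.

def _example_infer_types_call():
    # Hypothetical identifiers, for illustration only.
    data_params = {'module_name': 'INIT',         # module to fetch from
                   'file_name': 'my_source.csv'}  # file to fetch
    return _infer_types('my_project_id', data_params, module_params=None)
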
def upload(project_id):
    '''
    Uploads a file to a normalization project (NB: files cannot be uploaded
    directly to a link-type project).

    Also creates the mini version of the file.
    
    GET:
        - project_id: ID of the normalization project
        
    POST:
        
      file: (csv file) A csv to upload to the chosen normalization project
                  NB: the "filename" property will be used to name the file
      json:
        - module_params:
            - make_mini: (default True) Set to False to NOT create a mini version of the file
            - sample_size
            - randomize
    '''
    # Load project
    proj = ESNormalizer(project_id=project_id) 
    _, module_params = _parse_request()   
    if module_params is None:
        module_params = {}
    make_mini = module_params.get('make_mini', True) # TODO: can remove ?
    
    # Upload data        
    def custom_stream_factory(total_content_length, filename, content_type, content_length=None):
        tmpfile = tempfile.NamedTemporaryFile('wb+', prefix='flaskapp')
        app.logger.info("start receiving file {0} ... writing to temp file {1}".format(filename, tmpfile.name))
        return tmpfile
    
    _, _, files = werkzeug.formparser.parse_form_data(flask.request.environ, stream_factory=custom_stream_factory)
    
    
    # Fetch the uploaded file
    file_name = files['file'].filename
    stream = files['file'].stream
    
    _, run_info = proj.upload_init_data(stream, file_name)
    
    # Make mini
    if make_mini:
        proj.load_data('INIT', run_info['file_name'])
        proj.make_mini(module_params)
        
        # Write transformations and log # TODO: not clean
        if proj.metadata['has_mini']:
            proj.write_data()
        else:
            proj._write_metadata()
    
    return jsonify(run_info=run_info, project_id=proj.project_id)
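
A hedged client-side sketch of how the upload endpoint might be exercised with the
requests library. The URL pattern and the 'json' form field carrying module_params are
assumptions about how the route is wired, not facts taken from this snippet.

import json
import os

import requests

def _example_upload(project_id, csv_path):
    # Hypothetical URL; the actual route prefix depends on how the app is mounted.
    url = 'http://localhost:5000/api/upload/{0}'.format(project_id)
    module_params = {'make_mini': True, 'sample_size': 5000, 'randomize': True}
    with open(csv_path, 'rb') as f:
        resp = requests.post(
            url,
            files={'file': (os.path.basename(csv_path), f, 'text/csv')},
            # Assumption: module_params travel as a JSON-encoded form field.
            data={'json': json.dumps({'module_params': module_params})})
    resp.raise_for_status()
    return resp.json()  # expected keys: run_info, project_id
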
Example #5
def _create_es_index(project_id, data_params, module_params):
    '''
    Create an Elasticsearch index for the selected file
    
    GET:
        - project_id: Link project_id
    POST:
        - data_params: 
                        {
                            link_project_id: (optional) ID of the associated link project
                            project_type: (optional) defaults to link
                            module_name:
                            file_name: 
                        }
        - module_params: {
                            columns_to_index: 
                            for_linking: create the index for use as a linking referential (rather than for storage)
                            force: force re-creation of the index even if it already exists
                        }
    '''

    if module_params is None:
        module_params = {}

    print(module_params)
    columns_to_index = module_params.get('columns_to_index')
    force = module_params.get('force', False)
    for_linking = module_params.get('for_linking', True)

    if (not for_linking) and (columns_to_index is not None):
        raise ValueError(
            'columns_to_index cannot be specified when for_linking is False')

    project_type = 'link'
    if data_params is not None:
        module_name = data_params['module_name']
        file_name = data_params['file_name']
        project_type = data_params.get('project_type', 'link')

    # TODO: dirty fix for linking and normalization
    if for_linking:
        if project_type == 'link':
            proj_link = ESLinker(project_id)
            columns_to_index = proj_link.gen_default_columns_to_index()

            if data_params is None:
                module_name = proj_link.metadata['files']['ref']['module_name']
                file_name = proj_link.metadata['files']['ref']['file_name']

            proj = ESNormalizer(proj_link.ref.project_id)

        elif project_type == 'normalize':
            proj = ESNormalizer(project_id)
            assert columns_to_index is not None

    else:
        proj = ESLinker(project_id)
        if data_params is None:
            module_name, file_name = proj.get_last_written()

        # Type non str columns or use the default string analyzer
        types_dict = {float: 'float', bool: 'boolean', int: 'integer'}
        columns_to_index = {col: types_dict.get(proj._choose_dtype(col), {}) \
                            for col in proj._get_header(module_name, file_name)}

    file_path = proj.path_to(module_name, file_name)
    proj.create_index(file_path, columns_to_index, force,
                      proj.metadata.get('public', False))
    return
def _create_es_index(project_id, data_params, module_params):
    '''
    Create an Elasticsearch index for the selected file
    
    GET:
        - project_id: Link project_id
    POST:
        - data_params: 
                        {
                        project_type: (optional) defaults to link
                        module_name:
                        file_name: 
                        }
        - module_params: {
                            columns_to_index: 
                            for_linking: create the index for use as a linking referential (rather than for storage)
                            force: force re-creation of the index even if it already exists
                        }
    '''

    if module_params is None:
        module_params = {}

    print(module_params)
    columns_to_index = module_params.get('columns_to_index')
    force = module_params.get('force', False)
    for_linking = module_params.get('for_linking', True)

    if (not for_linking) and (columns_to_index is not None):
        raise ValueError(
            'columns_to_index cannot be specified when for_linking is False')

    project_type = 'link'
    if data_params is not None:
        module_name = data_params['module_name']
        file_name = data_params['file_name']
        project_type = data_params.get('project_type', 'link')

    # TODO: dirty fix for linking and normalization
    if for_linking:
        if project_type == 'link':
            proj_link = ESLinker(project_id)
            proj = ESNormalizer(proj_link.ref.project_id)

            if data_params is None:
                module_name = proj_link.metadata['files']['ref']['module_name']
                file_name = proj_link.metadata['files']['ref']['file_name']

        elif project_type == 'normalize':
            proj = ESNormalizer(project_id)

        # Generate default columns_to_index
        if columns_to_index is None:
            columns_to_index = proj.gen_default_columns_to_index(for_linking)

    else:
        proj = ESLinker(project_id)
        if data_params is None:
            module_name, file_name = proj.get_last_written()

        if columns_to_index is None:
            columns_to_index = {
                col: {}
                for col in proj._get_header(module_name, file_name)
            }

    file_path = proj.path_to(module_name, file_name)
    proj.create_index(file_path, columns_to_index, force)
    time.sleep(5)  # TODO: why is this necessary?
    return
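
A hypothetical direct call illustrating the parameter shapes documented above. The
project id, module and file names, and column names are placeholders; the empty dicts
stand in for per-column analyzer configurations.

def _example_create_index_call():
    data_params = {'project_type': 'normalize',
                   'module_name': 'INIT',
                   'file_name': 'my_file.csv'}
    module_params = {'columns_to_index': {'SIRET': {}, 'NOM': {}},
                     'for_linking': True,
                     'force': False}
    _create_es_index('my_project_id', data_params, module_params)
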
Example #7
class Linker(ESAbstractDataProject):
    MODULES = LINK_MODULES
    MODULE_ORDER = LINK_MODULE_ORDER
    MODULE_ORDER_log = LINK_MODULE_ORDER_log

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        self.source = None
        self.ref = None

        # Add source and ref if they were selected
        if (self.metadata['files']['source'] is not None) \
            and (self.metadata['files']['ref'] is not None):
            self.load_project_to_merge('source')
            self.load_project_to_merge('ref')

    def __repr__(self):
        string = '{0}({1})'.format(self.__class__.__name__, self.project_id)

        string += ' / source: '
        if self.source is not None:
            string += self.source.__repr__()
        else:
            string += 'None'

        string += ' / ref: '
        if self.ref is not None:
            string += self.ref.__repr__()
        else:
            string += 'None'
        return string

    def __str__(self):
        string = '{0}; project_id:{1}'.format(self.__class__.__name__,
                                              self.project_id)
        if self.source is not None:
            string += '\n\n***SOURCE***\n{0}'.format(self.source.__str__())
        if self.ref is not None:
            string += '\n\n***REF***\n{0}'.format(self.ref.__str__())
        return string

    @staticmethod
    def output_file_name(source_file_name):
        '''Name of the file to output'''
        return source_file_name

    def load_project_to_merge(self, file_role):
        '''Uses the "current" field in metadata to load source or ref'''
        self._check_file_role(file_role)
        # TODO: Add safeguard somewhere
        # Add source

        if file_role == 'source':
            try:
                self.source = ESNormalizer(
                    self.metadata['files']['source']['project_id'])
            except Exception:
                self.source = None

        if file_role == 'ref':
            try:
                self.ref = ESNormalizer(
                    self.metadata['files']['ref']['project_id'])
            except Exception:
                self.ref = None
            #raise Exception('Normalizer project with id {0} could not be found'.format(project_id))

    @staticmethod
    def _check_file_role(file_role):
        if file_role not in ['ref', 'source']:
            raise Exception('file_role should be either "source" or "ref"')

    def _check_select(self):
        '''Check that a source and referential were selected'''
        for file_role in ['source', 'ref']:
            if self.metadata['files'][file_role] is None:
                raise Exception(
                    '{0} is not defined for this linking project'.format(
                        file_role))

    def _create_metadata(self, *args, **kwargs):
        metadata = super()._create_metadata(*args, **kwargs)
        metadata['files'] = {'source': None, 'ref': None}
        metadata['project_type'] = 'link'
        return metadata

    def add_col_matches(self, column_matches):
        '''
        Adds a configuration file with the column matches between source and
        referential.
        
        INPUT:
            - column_matches: list of dicts (parsed JSON), each with "source",
              "ref" and an optional "exact_only" key
        '''

        # Remove labeller if it exists
        if self._has_labeller():
            self._remove_labeller()

        # TODO: add checks on file
        if (self.source is None) or (self.ref is None):
            raise RuntimeError(
                'Source or referential is not loaded; call add_selected_project '
                'and/or load_project_to_merge first.')

        # Remove duplicates from columns matches
        column_matches = [{'source': list(set(match['source'])),
                           'ref': list(set(match['ref'])),
                           'exact_only': match.get('exact_only', False)} \
                            for match in column_matches]

        # Remove matches with missing columns on one side or the other
        column_matches = [match for match in column_matches \
                          if match['source'] and match['ref']]

        if not column_matches:
            raise ValueError("You have to specify at least one pair of columns" \
                             + " in column matches.")

        # Add matches
        self.upload_config_data(column_matches, 'es_linker',
                                'column_matches.json')

        # Select these columns for normalization in source and ref

        # TODO: this will cover add_certain_col_matches
        # Add to log
        for file_name in self.metadata['log']:
            self.metadata['log'][file_name]['add_selected_columns'][
                'completed'] = True
        self._write_metadata()
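
    # Example of the expected column_matches structure (column names are
    # placeholders): each entry pairs source columns with referential columns
    # and may be flagged exact_only.
    #
    #     column_matches = [
    #         {'source': ['company_name'], 'ref': ['NOM'], 'exact_only': False},
    #         {'source': ['postcode'], 'ref': ['CP'], 'exact_only': True},
    #     ]
    #     linker.add_col_matches(column_matches)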

    def add_es_learned_settings(self, learned_settings):
        '''Adds the learned es configuration'''

        print('trying to upload', learned_settings)

        # TODO: figure out where to move this
        learned_settings['best_thresh'] = 1

        self.upload_config_data(learned_settings, 'es_linker',
                                'learned_settings.json')

        for file_name in self.metadata['log']:
            self.metadata['log'][file_name]['upload_es_train'][
                'completed'] = True
        self._write_metadata()

    def read_col_matches(self, add_created=True):
        '''
        Read the column_matches config file and interpret the columns, looking
        for processed (normalized) columns.
        '''
        config = self.read_config_data('es_linker', 'column_matches.json')

        if not config:
            config = []

        return config

    def add_col_certain_matches(self, column_matches):
        '''column_matches is a parsed JSON list of dicts of lists'''
        # TODO: add checks on file
        self.upload_config_data(column_matches, 'es_linker',
                                'column_certain_matches.json')

    def read_col_certain_matches(self):
        config = self.read_config_data('es_linker',
                                       'column_certain_matches.json')
        if not config:
            config = []
        return config

    def read_cols_to_return(self, file_role):
        config_file_name = 'columns_to_return_{0}.json'.format(file_role)
        config = self.read_config_data('es_linker', config_file_name)
        if not config:
            config = []
        return config

    def add_selected_project(self, file_role, public, project_id):
        '''
        Select the project to use as source or referential.

        INPUT:
            - file_role: "source" or "ref"
            - public: (bool) whether the project is available to all (as opposed to a user project)
            - project_id: ID of the normalization project to select
        '''
        self._check_file_role(file_role)
        # Check that file exists
        if public:
            raise DeprecationWarning
        else:
            proj = ESNormalizer(project_id)

        #        if file_name not in proj.metadata['files']:
        #            raise Exception('File {0} could not be found in project {1} \
        #                 (public: {2})'.format(file_name, project_id, public))

        # Check that normalization project has only one file (and possibly a MINI__ version)
        if not len(proj.metadata['files']):
            raise Exception(
                'The selected normalization project ({0}) has no uploaded file'.
                format(project_id))
        if len(proj.metadata['files']) > 1:
            raise Exception(('The selected normalization project ({0}) has more than one file.'
                             ' This method expects projects to have exactly 1 file as it'
                             ' uses the implicit get_last_written').format(project_id))

        # TODO: last written is a bad idea because if we modify normalization then BOOM !
        # TODO: last_written between concat_with_init and init ?
        (module_name, file_name) = proj.get_last_written()

        # TODO: add warning for implicit use of not-MINI
        if proj.metadata['has_mini']:
            file_name = file_name.replace('MINI__', '')

        # Record the selected file for this role
        self.metadata['files'][file_role] = {
            'public': public,
            'project_id': project_id,
            'module_name': module_name,
            'file_name': file_name,
            'restricted': False
        }

        # Create log for source
        if file_role == 'source':
            self.metadata['log'][self.output_file_name(
                file_name)] = self._default_log()

        # Add project selection
        if (self.metadata['files']['source']
                is not None) and (self.metadata['files']['ref'] is not None):
            for file_name in self.metadata['log']:
                self.metadata['log'][file_name]['INIT']['completed'] = True
        self._write_metadata()
        self.load_project_to_merge(file_role)

    def read_selected_files(self):
        '''
        Returns self.metadata['files']
        '''
        return self.metadata['files']

    def infer(self, module_name, params):
        '''Overwrite to allow restrict_reference'''
        if module_name == 'infer_restriction':
            params['NO_MEM_DATA'] = True
        return super().infer(module_name, params)

    def linker(self, module_name, data_params, module_params):
        '''Wrapper around link methods.'''

        if module_name == 'es_linker':
            return self.es_linker(module_params)
        elif module_name == 'dedupe_linker':
            raise DeprecationWarning

    def es_linker(self, module_params):
        module_params['index_name'] = ESNormalizer(
            self.ref.project_id).index_name

        s = self.metadata['files']['source']

        self.source.load_data(s['module_name'], s['file_name'])

        self.mem_data = self.source.mem_data
        self.mem_data_info = self.source.mem_data_info

        # Change file_name to output file_name
        self.mem_data_info['file_name'] = self.output_file_name(
            self.mem_data_info['file_name'])  # File being modified

        log, run_info = self.transform('es_linker', module_params)

        #print('DEF:', self.mem_data.columns)
        return log, run_info

    #==========================================================================
    #  Module specific: ES Linker
    #==========================================================================

    def _gen_paths_es(self):
        self._check_select()

        # Get path to training file for ES linker
        training_path = self.path_to('es_linker', 'training.json')
        learned_settings_path = self.path_to('es_linker',
                                             'learned_settings.json')

        # TODO: check that normalization projects are complete ?

        # Get path to source
        # TODO: fix this: use current
        file_name = self.metadata['files']['source']['file_name']
        source_path = self.source.path_to_last_written(module_name=None,
                                                       file_name=file_name)

        # Add paths
        paths = {
            'source': source_path,
            'train': training_path,
            'learned_settings': learned_settings_path
        }
        return paths

    @staticmethod
    def _tuple_or_string(x):
        if isinstance(x, str):
            return x
        elif isinstance(x, list):
            if len(x) == 1:
                return x[0]
            else:
                return tuple(x)
        elif isinstance(x, tuple):
            if len(x) == 1:
                return x[0]
            else:
                return x
        else:
            raise ValueError('Value should be str, list or tuple')
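
    # Behaviour of _tuple_or_string on the accepted input kinds:
    #     _tuple_or_string('name')            -> 'name'
    #     _tuple_or_string(['name'])          -> 'name'
    #     _tuple_or_string(['name', 'city'])  -> ('name', 'city')
    #     _tuple_or_string(('name',))         -> 'name'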

    def gen_default_columns_to_index(self):
        '''Generate the dict specifying the analyzers to use for each column 
        while indexing in Elasticsearch. 
        
        This method only takes into account the reference file so as to avoid
        re-indexing when using the same reference with a different source. This
        could change if partial re-indexing is implemented.

        Returns
        -------
        columns_to_index: dict
            A dict mapping each column name (str) to the set of Elasticsearch
            analyzers (str) to use on that column during indexing.
        '''
        INDEX_ALL = False  # Whether or not to index all selected columns of the file

        def temp(column_types, col):
            """Return the type specific default analyzer for a column or return 
            all default analyzers if type is not specified or could not be found.
            """
            return DEFAULT_ANALYZERS_TYPE.get(column_types.get(col),
                                              DEFAULT_ANALYZERS)

        # Try fetching referential column types
        # TODO: dangerous if config was not confirmed by user...
        column_types = self.ref.read_config_data('recode_types',
                                                 'infered_config.json')

        # Read column match data
        column_matches = self.read_config_data('es_linker',
                                               'column_matches.json')
        if not column_matches:
            raise RuntimeError('No column matches to read from')

        # Add default analyzer for columns that are exact matches

        if INDEX_ALL:
            list_of_columns_exact = self.ref.metadata['column_tracker'][
                'selected']
            list_of_columns_exact = {
                x
                for x in list_of_columns_exact if '__' not in x
            }
        else:
            exact_matches = filter(lambda m: m.get('exact_only', False),
                                   column_matches)
            list_of_columns_exact = {y for z in [[m['ref']] if isinstance(m['ref'], str) \
                                    else m['ref'] for m in exact_matches] for y in z}

        columns_to_index = {col: {} for col in list_of_columns_exact}

        # Add analyzers for columns that are non-exact matches
        # NB: Preserve order to not overwrite columns_to_index of non-exact
        non_exact_matches = filter(lambda m: not m.get('exact_only', False),
                                   column_matches)
        list_of_columns_non_exact = {y for z in [[m['ref']] if isinstance(m['ref'], str) \
                                else m['ref'] for m in non_exact_matches] for y in z}
        columns_to_index.update({
            col: temp(column_types, col)
            for col in list_of_columns_non_exact
        })

        # Add all columns that were selected
        for col in self.ref.metadata['column_tracker']['selected']:
            columns_to_index.setdefault(col, {})

        print('columns_to_index:')
        print(columns_to_index)

        return columns_to_index
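
    # Illustrative shape of the returned mapping (the analyzer names below are
    # placeholders standing in for entries of DEFAULT_ANALYZERS /
    # DEFAULT_ANALYZERS_TYPE):
    #     {
    #         'SIRET': {},                    # exact-only match: no analyzer
    #         'NOM': {'french', 'n_grams'},   # non-exact match: default analyzers
    #         'ADRESSE': {},                  # selected but unmatched column
    #     }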

    def _gen_es_labeller(self,
                         columns_to_index=None,
                         certain_column_matches=None):
        '''Return an ESLabeller object.'''
        self._check_select()

        #chunksize = 40000

        col_matches_tmp = self.read_col_matches()
        col_matches = []
        for match in col_matches_tmp:
            col_matches.append({
                'source': self._tuple_or_string(match['source']),
                'ref': self._tuple_or_string(match['ref'])
            })
        # TODO: lists to tuple in col_matches

        paths = self._gen_paths_es()
        source = pd.read_csv(paths['source'],
                             sep=',',
                             encoding='utf-8',
                             dtype=str,
                             nrows=3000)
        source = source.where(source.notnull(), '')

        ref_table_name = self.ref.project_id
        if columns_to_index is None:
            columns_to_index = self.gen_default_columns_to_index()

        print(columns_to_index)

        # TODO: Check that reference is indexed
        # TODO: Restrict columns to index to columns present in reference.

        labeller = ESLabeller(es, source, ref_table_name, col_matches,
                              columns_to_index, certain_column_matches)

        # TODO: Auto label certain pairs

        # TODO: Add pre-load for 3 first queries

        return labeller

    def _has_labeller(self):
        '''Check for json of labeller.'''
        file_path = self.path_to('es_linker', 'labeller.json')
        return os.path.isfile(file_path)

    def _remove_labeller(self):
        '''Remove json version of labeller.'''
        if self._has_labeller():
            self._remove('es_linker', 'labeller.json')

    def labeller_to_json(self, labeller):
        '''Write a Labeller object as a json in the appropriate directory. This
        includes a locking logic to avoid concurrent writes.
        '''
        NUM_RETRY = 10
        RETRY_INTERVAL = 0.1

        file_path = self.path_to('es_linker', 'labeller.json')

        for _ in range(NUM_RETRY):
            try:
                # Lock File before writing
                with open(file_path, 'a') as f:
                    fcntl.flock(f, fcntl.LOCK_EX | fcntl.LOCK_NB)

                # Write file
                labeller.to_json(file_path)

                # Unlock file
                with open(file_path, 'r') as w:
                    fcntl.flock(w, fcntl.LOCK_UN)
                break

            except BlockingIOError:
                time.sleep(RETRY_INTERVAL)
        else:
            raise BlockingIOError('{0} is un-writable because '.format(file_path) \
                                + 'it was locked by another process.')

    def labeller_from_json(self):
        file_path = self.path_to('es_linker', 'labeller.json')

        paths = self._gen_paths_es()
        source = pd.read_csv(paths['source'],
                             sep=',',
                             encoding='utf-8',
                             dtype=str,
                             nrows=3000)
        source = source.where(source.notnull(), '')

        ref_table_name = self.ref.project_id
        labeller = ESLabeller.from_json(file_path, es, source, ref_table_name)

        return labeller

    def analyze_results(self, params={}):
        # Check that memory is loaded (if necessary)
        self._check_mem_data()

        module_name = 'link_results_analyzer'

        # Initiate log
        log = self._init_active_log(module_name, 'infer')

        complete_metrics = defaultdict(int)

        for data in self.mem_data:
            metrics = link_results_analyzer(data, params)

            for col in ['num_match_thresh', 'num_match', 'num_verif_samples']:
                complete_metrics[col] += metrics[col]

            # Weigh ratios according to the number of samples (we divide after)
            complete_metrics['perc_match_thresh'] += metrics[
                'perc_match_thresh'] * metrics['num_match_thresh']
            complete_metrics[
                'perc_match'] += metrics['perc_match'] * metrics['num_match']
            complete_metrics['precision'] += metrics.get(
                'precision', 0) * metrics['num_verif_samples']

        if complete_metrics['num_match_thresh']:
            complete_metrics['perc_match_thresh'] /= complete_metrics[
                'num_match_thresh']

        if complete_metrics['num_match']:
            complete_metrics['perc_match'] /= complete_metrics['num_match']

        if complete_metrics['precision']:
            complete_metrics['precision'] /= complete_metrics[
                'num_verif_samples']

        # Write result of inference
        module_to_write_to = self.MODULES['infer'][module_name]['write_to']

        self.upload_config_data(complete_metrics, module_to_write_to,
                                'infered_config.json')

        # Update log buffer
        self._end_active_log(log, error=False)

        return complete_metrics

    #==========================================================================
    #  Elasticsearch
    #==========================================================================

    def update_results(self, labels):
        '''Updates the merged table in Elasticsearch to take into account the
        new labels.
        '''
        # TODO: source indices

        new_rows = []
        columns = set()
        for label in labels:
            current_row = es.get(self.index_name, 'structure',
                                 label['source_id'])['_source']
            if label['is_match']:
                if current_row['__ID_REF'] != label['ref_id']:
                    new_ref = es.get(self.ref.project_id, 'structure',
                                     label['ref_id'])['_source']
                    new_ref = {
                        key + '__REF': val
                        for key, val in new_ref.items()
                    }
                    new_row = {key: val for key, val in current_row.items()}
                    new_row.update(new_ref)
                    new_row['__IS_MATCH'] = True
                    new_row['__CONFIDENCE'] = 999
                    new_row['__ID_REF'] = label['ref_id']

                    # TODO: what to do with __ES_SCORE, __ID_QUERY, __THRESH
                else:
                    new_row = {key: val for key, val in current_row.items()}
                    new_row['__IS_MATCH'] = True
                    new_row['__CONFIDENCE'] = 999
            else:
                new_row = {col: val for col, val in current_row.items()}

                nan_cols = list(filter(lambda x: x[-5:]=='__REF', new_row.keys())) \
                            + ['__CONFIDENCE', '__ES_SCORE', '__ID_QUERY', \
                               '__ID_REF', '__IS_MATCH', '__THRESH']

                for col in nan_cols:
                    new_row[col] = np.nan

            columns.update(new_row.keys())
            new_rows.append((label['source_id'], new_row))

        if new_rows:
            dtype = {col: self._choose_dtype(col) for col in columns}
            tab = pd.DataFrame([x[1] for x in new_rows],
                               index=[x[0] for x in new_rows])

            # Fix for dtype that is not working in DataFrame call
            for k, v in dtype.items():
                if v == str:
                    tab[k].fillna('', inplace=True)
                tab[k] = tab[k].astype(v)

            ref_gen = (x for x in [tab])
            self.update_index(ref_gen)

        # Dirty method to keep track of modifications
        file_name = self.metadata['log'].keys()
        assert len(file_name) == 1
        file_name = list(file_name)[0]
        self.metadata['log'][file_name]['upload_es_train'][
            'was_modified'] = True
        self._write_metadata()
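
    # Shape of the `labels` argument expected by update_results (ids are
    # placeholders): one dict per labelled source row.
    #     labels = [
    #         {'source_id': 'doc_00042', 'ref_id': 'ref_00017', 'is_match': True},
    #         {'source_id': 'doc_00043', 'ref_id': 'ref_00099', 'is_match': False},
    #     ]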
Example #8
    def add_selected_project(self, file_role, public, project_id):
        '''
        Select the project to use as source or referential.

        INPUT:
            - file_role: "source" or "ref"
            - public: (bool) whether the project is available to all (as opposed to a user project)
            - project_id: ID of the normalization project to select
        '''
        self._check_file_role(file_role)
        # Check that file exists
        if public:
            raise DeprecationWarning
        else:
            proj = ESNormalizer(project_id)

        #        if file_name not in proj.metadata['files']:
        #            raise Exception('File {0} could not be found in project {1} \
        #                 (public: {2})'.format(file_name, project_id, public))

        # Check that normalization project has only one file (and possibly a MINI__ version)
        if not len(proj.metadata['files']):
            raise Exception(
                'The selected normalization project ({0}) has no uploaded file'.
                format(project_id))
        if len(proj.metadata['files']) > 1:
            raise Exception(('The selected normalization project ({0}) has more than one file.'
                             ' This method expects projects to have exactly 1 file as it'
                             ' uses the implicit get_last_written').format(project_id))

        # TODO: last written is a bad idea because if we modify normalization then BOOM !
        # TODO: last_written between concat_with_init and init ?
        (module_name, file_name) = proj.get_last_written()

        # TODO: add warning for implicit use of not-MINI
        if proj.metadata['has_mini']:
            file_name = file_name.replace('MINI__', '')

        # Record the selected file for this role
        self.metadata['files'][file_role] = {
            'public': public,
            'project_id': project_id,
            'module_name': module_name,
            'file_name': file_name,
            'restricted': False
        }

        # Create log for source
        if file_role == 'source':
            self.metadata['log'][self.output_file_name(
                file_name)] = self._default_log()

        # Add project selection
        if (self.metadata['files']['source']
                is not None) and (self.metadata['files']['ref'] is not None):
            for file_name in self.metadata['log']:
                self.metadata['log'][file_name]['INIT']['completed'] = True
        self._write_metadata()
        self.load_project_to_merge(file_role)
Example #9
class ESLinker(Linker):
    def path_to(self, module_name='', file_name=''):
        return self._path_to(LINK_DATA_PATH, module_name, file_name)


if __name__ == '__main__':

    assert False

    source_file_name = 'source.csv'
    source_user_given_name = 'my_source.csv'
    ref_file_name = 'ref.csv'

    # Create source
    proj = ESNormalizer(None, create_new=True)
    source_proj_id = proj.project_id

    # Upload files to normalize
    file_path = os.path.join('local_test_data', source_file_name)
    with open(file_path, 'rb') as f:
        proj.upload_init_data(f, source_file_name, source_user_given_name)

    # Create ref
    proj = ESNormalizer(None, create_new=True)
    ref_proj_id = proj.project_id

    # Upload files to normalize
    file_path = os.path.join('local_test_data', ref_file_name)
    with open(file_path, 'rb') as f:
        proj.upload_init_data(f, ref_file_name, ref_file_name)
Example #10
class Linker(ESAbstractDataProject):
    MODULES = LINK_MODULES
    MODULE_ORDER = LINK_MODULE_ORDER
    MODULE_ORDER_log = LINK_MODULE_ORDER_log

    def __init__(self,
                 project_id=None,
                 create_new=False,
                 display_name=None,
                 description=None,
                 public=False):

        super().__init__(project_id,
                         create_new,
                         display_name=display_name,
                         description=description,
                         public=public)

        self.source = None
        self.ref = None

        # Add source and ref if they were selected
        if (self.metadata['files']['source'] is not None) \
            and (self.metadata['files']['ref'] is not None):
            self.load_project_to_merge('source')
            self.load_project_to_merge('ref')

    def __repr__(self):
        string = '{0}({1})'.format(self.__class__.__name__, self.project_id)

        string += ' / source: '
        if self.source is not None:
            string += self.source.__repr__()
        else:
            string += 'None'

        string += ' / ref: '
        if self.ref is not None:
            string += self.ref.__repr__()
        else:
            string += 'None'
        return string

    def __str__(self):
        string = '{0}; project_id:{1}'.format(self.__class__.__name__,
                                              self.project_id)
        if self.source is not None:
            string += '\n\n***SOURCE***\n{0}'.format(self.source.__str__())
        if self.ref is not None:
            string += '\n\n***REF***\n{0}'.format(self.ref.__str__())
        return string

    @staticmethod
    def output_file_name(source_file_name):
        '''Name of the file to output'''
        return source_file_name

    def load_project_to_merge(self, file_role):
        '''Uses the "current" field in metadata to load source or ref'''
        self._check_file_role(file_role)
        # TODO: Add safeguard somewhere
        # Add source

        if file_role == 'source':
            try:
                self.source = ESNormalizer(
                    self.metadata['files']['source']['project_id'])
            except Exception:
                self.source = None

        if file_role == 'ref':
            try:
                self.ref = ESNormalizer(
                    self.metadata['files']['ref']['project_id'])
            except Exception:
                self.ref = None
            #raise Exception('Normalizer project with id {0} could not be found'.format(project_id))

    @staticmethod
    def _check_file_role(file_role):
        if file_role not in ['ref', 'source']:
            raise Exception('file_role should be either "source" or "ref"')

    def _check_select(self):
        '''Check that a source and referential were selected'''
        for file_role in ['source', 'ref']:
            if self.metadata['files'][file_role] is None:
                raise Exception(
                    '{0} is not defined for this linking project'.format(
                        file_role))

    def _create_metadata(self,
                         description=None,
                         display_name=None,
                         public=False):
        metadata = super()._create_metadata(description=description,
                                            display_name=display_name,
                                            public=public)
        metadata['files'] = {
            'source': None,
            'ref': None
        }  # e.g. {'source': {'public': False, 'project_id': 'ABC123', 'file_name': 'source.csv'}, 'ref': None}
        metadata['project_type'] = 'link'
        return metadata

    def add_col_matches(self, column_matches):
        '''
        Adds a configuration file with the column matches between source and
        referential.
        
        INPUT:
            - column_matches: list of dicts (parsed JSON) pairing source and
              referential columns
        '''
        # TODO: add checks on file
        if (self.source is None) or (self.ref is None):
            raise RuntimeError(
                'Source or referential is not loaded; call add_selected_project '
                'and/or load_project_to_merge first.')

        # Add matches
        self.upload_config_data(column_matches, 'es_linker',
                                'column_matches.json')

        # Select these columns for normalization in source and ref

        # TODO: this will cover add_certain_col_matches
        # Add to log
        for file_name in self.metadata['log']:
            self.metadata['log'][file_name]['add_selected_columns'][
                'completed'] = True
        self._write_metadata()

    def add_es_learned_settings(self, learned_settings):
        '''Adds the learned es configuration'''

        print('trying to upload', learned_settings)

        self.upload_config_data(learned_settings, 'es_linker',
                                'learned_settings.json')

        for file_name in self.metadata['log']:
            self.metadata['log'][file_name]['upload_es_train'][
                'completed'] = True
        self._write_metadata()

    def read_col_matches(self, add_created=True):
        '''
        Read the column_matches config file and interpret the columns, looking
        for processed (normalized) columns.
        '''
        config = self.read_config_data('es_linker', 'column_matches.json')

        if not config:
            config = []

        return config

    def add_col_certain_matches(self, column_matches):
        '''column_matches is a parsed JSON list of dicts of lists'''
        # TODO: add checks on file
        self.upload_config_data(column_matches, 'es_linker',
                                'column_certain_matches.json')

    def read_col_certain_matches(self):
        config = self.read_config_data('es_linker',
                                       'column_certain_matches.json')
        if not config:
            config = []
        return config

    def add_cols_to_return(self, file_role, columns):
        '''
        columns is a list of columns in the referential that we want to 
        return during download
        '''
        # Check that both projects are finished
        for role in ['source', 'ref']:
            file_name = self.metadata['files'][role]['file_name']
            if not self.__dict__[role].metadata['complete'][file_name]:
                raise Exception('Cannot select columns: complete {0} project \
                                ({1}) before...'.format(
                    role, self.__dict__[role].project_id))

        # Write columns to return to config
        config_file_name = 'columns_to_return_{0}.json'.format(file_role)
        self.upload_config_data(columns, 'es_linker', config_file_name)

    def read_cols_to_return(self, file_role):
        config_file_name = 'columns_to_return_{0}.json'.format(file_role)
        config = self.read_config_data('es_linker', config_file_name)
        if not config:
            config = []
        return config

    def add_selected_project(self, file_role, public, project_id):
        '''
        Select the project to use as source or referential.

        INPUT:
            - file_role: "source" or "ref"
            - public: (bool) whether the project is available to all (as opposed to a user project)
            - project_id: ID of the normalization project to select
        '''
        self._check_file_role(file_role)
        # Check that file exists
        if public:
            raise DeprecationWarning
        else:
            proj = ESNormalizer(project_id)

        #        if file_name not in proj.metadata['files']:
        #            raise Exception('File {0} could not be found in project {1} \
        #                 (public: {2})'.format(file_name, project_id, public))

        # Check that normalization project has only one file (and possibly a MINI__ version)
        if not len(proj.metadata['files']):
            raise Exception(
                'The selected normalization project ({0}) has no uploaded file'.
                format(project_id))
        if len(proj.metadata['files']) > 1:
            raise Exception(('The selected normalization project ({0}) has more than one file.'
                             ' This method expects projects to have exactly 1 file as it'
                             ' uses the implicit get_last_written').format(project_id))

        # TODO: last written is a bad idea because if we modify normalization then BOOM !
        # TODO: last_written between concat_with_init and init ?
        (module_name, file_name) = proj.get_last_written()

        # TODO: add warning for implicit use of not-MINI
        if proj.metadata['has_mini']:
            file_name = file_name.replace('MINI__', '')

        # Record the selected file for this role
        self.metadata['files'][file_role] = {
            'public': public,
            'project_id': project_id,
            'module_name': module_name,
            'file_name': file_name,
            'restricted': False
        }

        # Create log for source
        if file_role == 'source':
            self.metadata['log'][self.output_file_name(
                file_name)] = self._default_log()

        # Add project selection
        if (self.metadata['files']['source']
                is not None) and (self.metadata['files']['ref'] is not None):
            for file_name in self.metadata['log']:
                self.metadata['log'][file_name]['INIT']['completed'] = True
        self._write_metadata()
        self.load_project_to_merge(file_role)

    def read_selected_files(self):
        '''
        Returns self.metadata['files']
        '''
        return self.metadata['files']

    def infer(self, module_name, params):
        '''Overwrite to allow restrict_reference'''
        if module_name == 'infer_restriction':
            params['NO_MEM_DATA'] = True
        return super().infer(module_name, params)

    def linker(self, module_name, data_params, module_params):
        '''Wrapper around link methods.'''
        if module_name == 'es_linker':
            return self.es_linker(module_params)
        elif module_name == 'dedupe_linker':
            raise DeprecationWarning

    def es_linker(self, module_params):
        module_params['index_name'] = ESNormalizer(
            self.ref.project_id).index_name

        self.source.load_data(*self.source.get_last_written())
        self.mem_data = self.source.mem_data
        self.mem_data_info = self.source.mem_data_info

        # Change file_name to output file_name
        self.mem_data_info['file_name'] = self.output_file_name(
            self.mem_data_info['file_name'])  # File being modified

        log, run_info = self.transform('es_linker', module_params)

        return log, run_info

    def write_labeller(self, module_name, labeller):
        '''Pickles the labeller object in project'''
        # TODO: Add isinstance(labeller, Labeller)
        pickle_path = self.path_to(module_name, 'labeller.pkl')

        labeller.to_pickle(pickle_path)

    def _read_labeller(self, module_name):
        '''Reads labeller stored in pickle'''
        pickle_path = self.path_to(module_name, 'labeller.pkl')

        labeller = ESLabeller.from_pickle(pickle_path, es)
        return labeller

    #==========================================================================
    #  Module specific: ES Linker
    #==========================================================================

    def _gen_paths_es(self):
        self._check_select()

        # Get path to training file for ES linker
        training_path = self.path_to('es_linker', 'training.json')
        learned_settings_path = self.path_to('es_linker',
                                             'learned_settings.json')

        # TODO: check that normalization projects are complete ?

        # Get path to source
        # TODO: fix this: use current
        file_name = self.metadata['files']['source']['file_name']
        source_path = self.source.path_to_last_written(module_name=None,
                                                       file_name=file_name)

        # Add paths
        paths = {
            'source': source_path,
            'train': training_path,
            'learned_settings': learned_settings_path
        }
        return paths

    @staticmethod
    def _tuple_or_string(x):
        if isinstance(x, str):
            return x
        elif isinstance(x, list):
            if len(x) == 1:
                return x[0]
            else:
                return tuple(x)
        elif isinstance(x, tuple):
            if len(x) == 1:
                return x[0]
            else:
                return x
        else:
            raise ValueError('Value should be str, list or tuple')

    def _gen_es_labeller(self,
                         columns_to_index=None,
                         certain_column_matches=None):
        '''
        Return an ESLabeller object.
        '''
        self._check_select()

        #chunksize = 40000

        col_matches_tmp = self.read_col_matches()
        col_matches = []
        for match in col_matches_tmp:
            col_matches.append({
                'source': self._tuple_or_string(match['source']),
                'ref': self._tuple_or_string(match['ref'])
            })
        # TODO: lists to tuple in col_matches

        paths = self._gen_paths_es()
        source = pd.read_csv(paths['source'],
                             sep=',',
                             encoding='utf-8',
                             dtype=str,
                             nrows=3000)
        source = source.where(source.notnull(), '')

        ref_table_name = self.ref.project_id
        if columns_to_index is None:
            columns_to_index = self.ref.gen_default_columns_to_index()

        labeller = ESLabeller(es, source, ref_table_name, col_matches,
                              columns_to_index, certain_column_matches)

        # TODO: Auto label certain pairs

        # TODO: Add pre-load for 3 first queries

        return labeller

    def labeller_to_json(self, labeller):

        file_path = self.path_to('es_linker', 'labeller.json')
        labeller.to_json(file_path)

    def labeller_from_json(self):
        file_path = self.path_to('es_linker', 'labeller.json')

        paths = self._gen_paths_es()
        source = pd.read_csv(paths['source'],
                             sep=',',
                             encoding='utf-8',
                             dtype=str,
                             nrows=3000)
        source = source.where(source.notnull(), '')

        ref_table_name = self.ref.project_id
        labeller = ESLabeller.from_json(file_path, es, source, ref_table_name)

        return labeller

    def analyze_results(self, params={}):
        # Check that memory is loaded (if necessary)
        self._check_mem_data()

        module_name = 'link_results_analyzer'

        # Initiate log
        log = self._init_active_log(module_name, 'infer')

        agg_results = defaultdict(int)
        for data in self.mem_data:
            infered_params = link_results_analyzer(data, params)

            agg_results['num_match'] += infered_params['num_match']
            agg_results['num_match_thresh'] += infered_params[
                'num_match_thresh']

        # Write result of inference
        module_to_write_to = self.MODULES['infer'][module_name]['write_to']

        self.upload_config_data(agg_results, module_to_write_to,
                                'infered_config.json')

        # Update log buffer
        self._end_active_log(log, error=False)

        return agg_results