def _recode_types(project_id, data_params, module_params):
    '''
    Runs the recoding module
    
    ARGUMENTS (GET):
        project_id: ID for "normalize" project

    ARGUMENTS (POST):
        - data_params: {
                "module_name": module to fetch from
                "file_name": file to fetch
                }
        - module_params: same as result of infer_mvs
    '''
    proj = ESNormalizer(project_id=project_id)

    proj.load_data(data_params['module_name'], data_params['file_name'])

    _, run_info = proj.transform('recode_types', module_params)

    # Write transformations and logs
    proj.write_data()

    return run_info
def _run_all_transforms(project_id, data_params, *argv):
    '''
    Run all transformations that have already been run (as indicated by the
    presence of run_info.json files), reusing the parameters stored in those
    run_info.json files.

    ARGUMENTS (GET):
        project_id: ID for "normalize" project

    ARGUMENTS (POST):
        - data_params: {
                "file_name": file to use for the transforms (module_name is 'INIT')
                }
    '''
    proj = ESNormalizer(project_id=project_id)

    file_name = data_params['file_name']

    proj.load_data('INIT', file_name)
    all_run_infos = proj.run_all_transforms()

    # Write transformations and logs
    proj.write_data()
    return all_run_infos
def _infer_types(project_id, data_params, module_params):
    '''
    Runs the infer_types module
    
    Wrapper around ESNormalizer.infer.
    
    ARGUMENTS (GET):
        project_id: ID for "normalize" project

    ARGUMENTS (POST):
        - data_params: {
                "module_name": module to fetch from
                "file_name": file to fetch
                }
        - module_params: none
    
    '''
    proj = ESNormalizer(project_id=project_id)
    proj.load_data(data_params['module_name'], data_params['file_name'])
    result = proj.infer('infer_types', module_params)

    # Write log
    proj._write_log_buffer(False)
    return result
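
The wrapper functions above share the same calling convention: a project id plus a
data_params dict naming the module and file to load. Below is a minimal, hypothetical
sketch of how _infer_types might be invoked; the project id, module name and file name
are placeholders, not values taken from a real project.

def _example_infer_types_call():
    # Hypothetical identifiers, for illustration only.
    data_params = {'module_name': 'INIT',         # module to fetch from
                   'file_name': 'my_source.csv'}  # file to fetch
    return _infer_types('my_project_id', data_params, module_params=None)
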
def upload(project_id):
    '''
    Uploads a file to a normalization project (NB: files cannot be uploaded
    directly to a link-type project).

    Also creates the mini version of the file.
    
    GET:
        - project_id: ID of the normalization project
        
    POST:
        
      file: (csv file) A csv to upload to the chosen normalization project
                  NB: the "filename" property will be used to name the file
      json:
        - module_params:
            - make_mini: (default True) Set to False to NOT create a mini version of the file
            - sample_size
            - randomize
    '''
    # Load project
    proj = ESNormalizer(project_id=project_id) 
    _, module_params = _parse_request()   
    if module_params is None:
        module_params = {}
    make_mini = module_params.get('make_mini', True) # TODO: can remove ?
    
    # Upload data        
    def custom_stream_factory(total_content_length, filename, content_type, content_length=None):
        tmpfile = tempfile.NamedTemporaryFile('wb+', prefix='flaskapp')
        app.logger.info("start receiving file {0} ... writing to temp file {1}".format(filename, tmpfile.name))
        return tmpfile
    
    _, _, files = werkzeug.formparser.parse_form_data(flask.request.environ, stream_factory=custom_stream_factory)
    
    
    # Fetch the uploaded file
    file_name = files['file'].filename
    stream = files['file'].stream
    
    _, run_info = proj.upload_init_data(stream, file_name)
    
    # Make mini
    if make_mini:
        proj.load_data('INIT', run_info['file_name'])
        proj.make_mini(module_params)
        
        # Write transformations and log # TODO: not clean
        if proj.metadata['has_mini']:
            proj.write_data()
        else:
            proj._write_metadata()
    
    return jsonify(run_info=run_info, project_id=proj.project_id)
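
A hedged client-side sketch of how the upload endpoint might be exercised with the
requests library. The URL pattern and the 'json' form field carrying module_params are
assumptions about how the route is wired, not facts taken from this snippet.

import json
import os

import requests

def _example_upload(project_id, csv_path):
    # Hypothetical URL; the actual route prefix depends on how the app is mounted.
    url = 'http://localhost:5000/api/upload/{0}'.format(project_id)
    module_params = {'make_mini': True, 'sample_size': 5000, 'randomize': True}
    with open(csv_path, 'rb') as f:
        resp = requests.post(
            url,
            files={'file': (os.path.basename(csv_path), f, 'text/csv')},
            # Assumption: module_params travel as a JSON-encoded form field.
            data={'json': json.dumps({'module_params': module_params})})
    resp.raise_for_status()
    return resp.json()  # expected keys: run_info, project_id
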
Example #5
def _create_es_index(project_id, data_params, module_params):
    '''
    Create an Elasticsearch index for the selected file
    
    GET:
        - project_id: Link project_id
    POST:
        - data_params: 
                        {
                            link_project_id: (optional) ID of the associated link project
                            project_type: (optional) defaults to link
                            module_name:
                            file_name: 
                        }
        - module_params: {
                            columns_to_index: 
                            for_linking: create the index for use as a linking referential (rather than for storage)
                            force: force re-creation of the index even if it already exists
                        }
    '''

    if module_params is None:
        module_params = {}

    print(module_params)
    columns_to_index = module_params.get('columns_to_index')
    force = module_params.get('force', False)
    for_linking = module_params.get('for_linking', True)

    if (not for_linking) and (columns_to_index is not None):
        raise ValueError(
            'columns_to_index cannot be specified when for_linking is False')

    project_type = 'link'
    if data_params is not None:
        module_name = data_params['module_name']
        file_name = data_params['file_name']
        project_type = data_params.get('project_type', 'link')

    # TODO: dirty fix for linking and normalization
    if for_linking:
        if project_type == 'link':
            proj_link = ESLinker(project_id)
            columns_to_index = proj_link.gen_default_columns_to_index()

            if data_params is None:
                module_name = proj_link.metadata['files']['ref']['module_name']
                file_name = proj_link.metadata['files']['ref']['file_name']

            proj = ESNormalizer(proj_link.ref.project_id)

        elif project_type == 'normalize':
            proj = ESNormalizer(project_id)
            assert columns_to_index is not None

    else:
        proj = ESLinker(project_id)
        if data_params is None:
            module_name, file_name = proj.get_last_written()

        # Type non str columns or use the default string analyzer
        types_dict = {float: 'float', bool: 'boolean', int: 'integer'}
        columns_to_index = {col: types_dict.get(proj._choose_dtype(col), {}) \
                            for col in proj._get_header(module_name, file_name)}

    file_path = proj.path_to(module_name, file_name)
    proj.create_index(file_path, columns_to_index, force,
                      proj.metadata.get('public', False))
    return
def _create_es_index(project_id, data_params, module_params):
    '''
    Create an Elasticsearch index for the selected file
    
    GET:
        - project_id: Link project_id
    POST:
        - data_params: 
                        {
                        project_type: (optional) defaults to link
                        module_name:
                        file_name: 
                        }
        - module_params: {
                            columns_to_index: 
                            for_linking: create the index for use as a linking referential (rather than for storage)
                            force: force re-creation of the index even if it already exists
                        }
    '''

    if module_params is None:
        module_params = {}

    print(module_params)
    columns_to_index = module_params.get('columns_to_index')
    force = module_params.get('force', False)
    for_linking = module_params.get('for_linking', True)

    if (not for_linking) and (columns_to_index is not None):
        raise ValueError(
            'columns_to_index cannot be specified when for_linking is False')

    project_type = 'link'
    if data_params is not None:
        module_name = data_params['module_name']
        file_name = data_params['file_name']
        project_type = data_params.get('project_type', 'link')

    # TODO: dirty fix for linking and normalization
    if for_linking:
        if project_type == 'link':
            proj_link = ESLinker(project_id)
            proj = ESNormalizer(proj_link.ref.project_id)

            if data_params is None:
                module_name = proj_link.metadata['files']['ref']['module_name']
                file_name = proj_link.metadata['files']['ref']['file_name']

        elif project_type == 'normalize':
            proj = ESNormalizer(project_id)

        # Generate default columns_to_index
        if columns_to_index is None:
            columns_to_index = proj.gen_default_columns_to_index(for_linking)

    else:
        proj = ESLinker(project_id)
        if data_params is None:
            module_name, file_name = proj.get_last_written()

        if columns_to_index is None:
            columns_to_index = {
                col: {}
                for col in proj._get_header(module_name, file_name)
            }

    file_path = proj.path_to(module_name, file_name)
    proj.create_index(file_path, columns_to_index, force)
    time.sleep(5)  # TODO: why is this necessary?
    return
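
A hypothetical direct call illustrating the parameter shapes documented above. The
project id, module and file names, and column names are placeholders; the empty dicts
stand in for per-column analyzer configurations.

def _example_create_index_call():
    data_params = {'project_type': 'normalize',
                   'module_name': 'INIT',
                   'file_name': 'my_file.csv'}
    module_params = {'columns_to_index': {'SIRET': {}, 'NOM': {}},
                     'for_linking': True,
                     'force': False}
    _create_es_index('my_project_id', data_params, module_params)
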
Example #7
class Linker(ESAbstractDataProject):
    MODULES = LINK_MODULES
    MODULE_ORDER = LINK_MODULE_ORDER
    MODULE_ORDER_log = LINK_MODULE_ORDER_log

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        self.source = None
        self.ref = None

        # Add source and ref if they were selected
        if (self.metadata['files']['source'] is not None) \
            and (self.metadata['files']['ref'] is not None):
            self.load_project_to_merge('source')
            self.load_project_to_merge('ref')

    def __repr__(self):
        string = '{0}({1})'.format(self.__class__.__name__, self.project_id)

        string += ' / source: '
        if self.source is not None:
            string += self.source.__repr__()
        else:
            string += 'None'

        string += ' / ref: '
        if self.ref is not None:
            string += self.ref.__repr__()
        else:
            string += 'None'
        return string

    def __str__(self):
        string = '{0}; project_id:{1}'.format(self.__class__.__name__,
                                              self.project_id)
        if self.source is not None:
            string += '\n\n***SOURCE***\n{0}'.format(self.source.__str__())
        if self.ref is not None:
            string += '\n\n***REF***\n{0}'.format(self.ref.__str__())
        return string

    @staticmethod
    def output_file_name(source_file_name):
        '''Name of the file to output'''
        return source_file_name

    def load_project_to_merge(self, file_role):
        '''Uses the "current" field in metadata to load source or ref'''
        self._check_file_role(file_role)
        # TODO: Add safeguard somewhere
        # Add source

        if file_role == 'source':
            try:
                self.source = ESNormalizer(
                    self.metadata['files']['source']['project_id'])
            except Exception:
                self.source = None

        if file_role == 'ref':
            try:
                self.ref = ESNormalizer(
                    self.metadata['files']['ref']['project_id'])
            except Exception:
                self.ref = None
            #raise Exception('Normalizer project with id {0} could not be found'.format(project_id))

    @staticmethod
    def _check_file_role(file_role):
        if file_role not in ['ref', 'source']:
            raise Exception('file_role should be either "source" or "ref"')

    def _check_select(self):
        '''Check that a source and referential were selected'''
        for file_role in ['source', 'ref']:
            if self.metadata['files'][file_role] is None:
                raise Exception(
                    '{0} is not defined for this linking project'.format(
                        file_role))

    def _create_metadata(self, *args, **kwargs):
        metadata = super()._create_metadata(*args, **kwargs)
        metadata['files'] = {'source': None, 'ref': None}
        metadata['project_type'] = 'link'
        return metadata

    def add_col_matches(self, column_matches):
        '''
        Adds a configuration file with the column matches between source and
        referential.
        
        INPUT:
            - column_matches: list of dicts (parsed JSON), each with "source",
              "ref" and an optional "exact_only" key
        '''

        # Remove labeller if it exists
        if self._has_labeller():
            self._remove_labeller()

        # TODO: add checks on file
        if (self.source is None) or (self.ref is None):
            raise RuntimeError(
                'Source or referential is not loaded; call add_selected_project '
                'and/or load_project_to_merge first.')

        # Remove duplicates from columns matches
        column_matches = [{'source': list(set(match['source'])),
                           'ref': list(set(match['ref'])),
                           'exact_only': match.get('exact_only', False)} \
                            for match in column_matches]

        # Remove matches with missing columns on one side or the other
        column_matches = [match for match in column_matches \
                          if match['source'] and match['ref']]

        if not column_matches:
            raise ValueError("You have to specify at least one pair of columns" \
                             + " in column matches.")

        # Add matches
        self.upload_config_data(column_matches, 'es_linker',
                                'column_matches.json')

        # Select these columns for normalization in source and ref

        # TODO: this will cover add_certain_col_matches
        # Add to log
        for file_name in self.metadata['log']:
            self.metadata['log'][file_name]['add_selected_columns'][
                'completed'] = True
        self._write_metadata()
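
    # Example of the expected column_matches structure (column names are
    # placeholders): each entry pairs source columns with referential columns
    # and may be flagged exact_only.
    #
    #     column_matches = [
    #         {'source': ['company_name'], 'ref': ['NOM'], 'exact_only': False},
    #         {'source': ['postcode'], 'ref': ['CP'], 'exact_only': True},
    #     ]
    #     linker.add_col_matches(column_matches)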

    def add_es_learned_settings(self, learned_settings):
        '''Adds the learned es configuration'''

        print('trying to upload', learned_settings)

        # TODO: figure out where to move this
        learned_settings['best_thresh'] = 1

        self.upload_config_data(learned_settings, 'es_linker',
                                'learned_settings.json')

        for file_name in self.metadata['log']:
            self.metadata['log'][file_name]['upload_es_train'][
                'completed'] = True
        self._write_metadata()

    def read_col_matches(self, add_created=True):
        '''
        Read the column_matches config file and interpret the columns, looking
        for processed (normalized) columns.
        '''
        config = self.read_config_data('es_linker', 'column_matches.json')

        if not config:
            config = []

        return config

    def add_col_certain_matches(self, column_matches):
        '''column_matches is a parsed JSON list of dicts of lists'''
        # TODO: add checks on file
        self.upload_config_data(column_matches, 'es_linker',
                                'column_certain_matches.json')

    def read_col_certain_matches(self):
        config = self.read_config_data('es_linker',
                                       'column_certain_matches.json')
        if not config:
            config = []
        return config

    def read_cols_to_return(self, file_role):
        config_file_name = 'columns_to_return_{0}.json'.format(file_role)
        config = self.read_config_data('es_linker', config_file_name)
        if not config:
            config = []
        return config

    def add_selected_project(self, file_role, public, project_id):
        '''
        Select the project to use as source or referential.

        INPUT:
            - file_role: "source" or "ref"
            - public: (bool) whether the project is available to all (as opposed to a user project)
            - project_id: ID of the normalization project to select
        '''
        self._check_file_role(file_role)
        # Check that file exists
        if public:
            raise DeprecationWarning
        else:
            proj = ESNormalizer(project_id)

        #        if file_name not in proj.metadata['files']:
        #            raise Exception('File {0} could not be found in project {1} \
        #                 (public: {2})'.format(file_name, project_id, public))

        # Check that normalization project has only one file (and possibly a MINI__ version)
        if not len(proj.metadata['files']):
            raise Exception(
                'The selected normalization project ({0}) has no uploaded file'.
                format(project_id))
        if len(proj.metadata['files']) > 1:
            raise Exception(('The selected normalization project ({0}) has more than one file.'
                             ' This method expects projects to have exactly 1 file as it'
                             ' uses the implicit get_last_written').format(project_id))

        # TODO: last written is a bad idea because if we modify normalization then BOOM !
        # TODO: last_written between concat_with_init and init ?
        (module_name, file_name) = proj.get_last_written()

        # TODO: add warning for implicit use of not-MINI
        if proj.metadata['has_mini']:
            file_name = file_name.replace('MINI__', '')

        # Record the selected file for this role
        self.metadata['files'][file_role] = {
            'public': public,
            'project_id': project_id,
            'module_name': module_name,
            'file_name': file_name,
            'restricted': False
        }

        # Create log for source
        if file_role == 'source':
            self.metadata['log'][self.output_file_name(
                file_name)] = self._default_log()

        # Add project selection
        if (self.metadata['files']['source']
                is not None) and (self.metadata['files']['ref'] is not None):
            for file_name in self.metadata['log']:
                self.metadata['log'][file_name]['INIT']['completed'] = True
        self._write_metadata()
        self.load_project_to_merge(file_role)

    def read_selected_files(self):
        '''
        Returns self.metadata['files']
        '''
        return self.metadata['files']

    def infer(self, module_name, params):
        '''Overwrite to allow restrict_reference'''
        if module_name == 'infer_restriction':
            params['NO_MEM_DATA'] = True
        return super().infer(module_name, params)

    def linker(self, module_name, data_params, module_params):
        '''Wrapper around link methods.'''

        if module_name == 'es_linker':
            return self.es_linker(module_params)
        elif module_name == 'dedupe_linker':
            raise DeprecationWarning

    def es_linker(self, module_params):
        module_params['index_name'] = ESNormalizer(
            self.ref.project_id).index_name

        s = self.metadata['files']['source']

        self.source.load_data(s['module_name'], s['file_name'])

        self.mem_data = self.source.mem_data
        self.mem_data_info = self.source.mem_data_info

        # Change file_name to output file_name
        self.mem_data_info['file_name'] = self.output_file_name(
            self.mem_data_info['file_name'])  # File being modified

        log, run_info = self.transform('es_linker', module_params)

        #print('DEF:', self.mem_data.columns)
        return log, run_info

    #==========================================================================
    #  Module specific: ES Linker
    #==========================================================================

    def _gen_paths_es(self):
        self._check_select()

        # Get path to training file for ES linker
        training_path = self.path_to('es_linker', 'training.json')
        learned_settings_path = self.path_to('es_linker',
                                             'learned_settings.json')

        # TODO: check that normalization projects are complete ?

        # Get path to source
        # TODO: fix this: use current
        file_name = self.metadata['files']['source']['file_name']
        source_path = self.source.path_to_last_written(module_name=None,
                                                       file_name=file_name)

        # Add paths
        paths = {
            'source': source_path,
            'train': training_path,
            'learned_settings': learned_settings_path
        }
        return paths

    @staticmethod
    def _tuple_or_string(x):
        if isinstance(x, str):
            return x
        elif isinstance(x, list):
            if len(x) == 1:
                return x[0]
            else:
                return tuple(x)
        elif isinstance(x, tuple):
            if len(x) == 1:
                return x[0]
            else:
                return x
        else:
            raise ValueError('Value should be str, list or tuple')
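
    # Behaviour of _tuple_or_string on the accepted input kinds:
    #     _tuple_or_string('name')            -> 'name'
    #     _tuple_or_string(['name'])          -> 'name'
    #     _tuple_or_string(['name', 'city'])  -> ('name', 'city')
    #     _tuple_or_string(('name',))         -> 'name'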

    def gen_default_columns_to_index(self):
        '''Generate the dict specifying the analyzers to use for each column 
        while indexing in Elasticsearch. 
        
        This method only takes into account the reference file so as to avoid
        re-indexing when using the same reference with a different source. This
        could change if partial re-indexing is implemented.

        Returns
        -------
        columns_to_index: dict
            A dict mapping each column name (str) to the set of Elasticsearch
            analyzers (str) to use on that column during indexing.
        '''
        INDEX_ALL = False  # Whether or not to index all selected columns of the file

        def temp(column_types, col):
            """Return the type specific default analyzer for a column or return 
            all default analyzers if type is not specified or could not be found.
            """
            return DEFAULT_ANALYZERS_TYPE.get(column_types.get(col),
                                              DEFAULT_ANALYZERS)

        # Try fetching referential column types
        # TODO: dangerous if config was not confirmed by user...
        column_types = self.ref.read_config_data('recode_types',
                                                 'infered_config.json')

        # Read column match data
        column_matches = self.read_config_data('es_linker',
                                               'column_matches.json')
        if not column_matches:
            raise RuntimeError('No column matches to read from')

        # Add default analyzer for columns that are exact matches

        if INDEX_ALL:
            list_of_columns_exact = self.ref.metadata['column_tracker'][
                'selected']
            list_of_columns_exact = {
                x
                for x in list_of_columns_exact if '__' not in x
            }
        else:
            exact_matches = filter(lambda m: m.get('exact_only', False),
                                   column_matches)
            list_of_columns_exact = {y for z in [[m['ref']] if isinstance(m['ref'], str) \
                                    else m['ref'] for m in exact_matches] for y in z}

        columns_to_index = {col: {} for col in list_of_columns_exact}

        # Add analyzers for columns that are non-exact matches
        # NB: Preserve order to not overwrite columns_to_index of non-exact
        non_exact_matches = filter(lambda m: not m.get('exact_only', False),
                                   column_matches)
        list_of_columns_non_exact = {y for z in [[m['ref']] if isinstance(m['ref'], str) \
                                else m['ref'] for m in non_exact_matches] for y in z}
        columns_to_index.update({
            col: temp(column_types, col)
            for col in list_of_columns_non_exact
        })

        # Add all columns that were selected
        for col in self.ref.metadata['column_tracker']['selected']:
            columns_to_index.setdefault(col, {})

        print('columns_to_index:')
        print(columns_to_index)

        return columns_to_index
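
    # Illustrative shape of the returned mapping (the analyzer names below are
    # placeholders standing in for entries of DEFAULT_ANALYZERS /
    # DEFAULT_ANALYZERS_TYPE):
    #     {
    #         'SIRET': {},                    # exact-only match: no analyzer
    #         'NOM': {'french', 'n_grams'},   # non-exact match: default analyzers
    #         'ADRESSE': {},                  # selected but unmatched column
    #     }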

    def _gen_es_labeller(self,
                         columns_to_index=None,
                         certain_column_matches=None):
        '''Return an ESLabeller object.'''
        self._check_select()

        #chunksize = 40000

        col_matches_tmp = self.read_col_matches()
        col_matches = []
        for match in col_matches_tmp:
            col_matches.append({
                'source': self._tuple_or_string(match['source']),
                'ref': self._tuple_or_string(match['ref'])
            })
        # TODO: lists to tuple in col_matches

        paths = self._gen_paths_es()
        source = pd.read_csv(paths['source'],
                             sep=',',
                             encoding='utf-8',
                             dtype=str,
                             nrows=3000)
        source = source.where(source.notnull(), '')

        ref_table_name = self.ref.project_id
        if columns_to_index is None:
            columns_to_index = self.gen_default_columns_to_index()

        print(columns_to_index)

        # TODO: Check that reference is indexed
        # TODO: Restrict columns to index to columns present in reference.

        labeller = ESLabeller(es, source, ref_table_name, col_matches,
                              columns_to_index, certain_column_matches)

        # TODO: Auto label certain pairs

        # TODO: Add pre-load for 3 first queries

        return labeller

    def _has_labeller(self):
        '''Check for json of labeller.'''
        file_path = self.path_to('es_linker', 'labeller.json')
        return os.path.isfile(file_path)

    def _remove_labeller(self):
        '''Remove json version of labeller.'''
        if self._has_labeller():
            self._remove('es_linker', 'labeller.json')

    def labeller_to_json(self, labeller):
        '''Write a Labeller object as a json in the appropriate directory. This
        includes a locking logic to avoid concurrent writes.
        '''
        NUM_RETRY = 10
        RETRY_INTERVAL = 0.1

        file_path = self.path_to('es_linker', 'labeller.json')

        for _ in range(NUM_RETRY):
            try:
                # Lock File before writing
                with open(file_path, 'a') as f:
                    fcntl.flock(f, fcntl.LOCK_EX | fcntl.LOCK_NB)

                # Write file
                labeller.to_json(file_path)

                # Unlock file
                with open(file_path, 'r') as w:
                    fcntl.flock(w, fcntl.LOCK_UN)
                break

            except BlockingIOError:
                time.sleep(RETRY_INTERVAL)
        else:
            raise BlockingIOError('{0} is un-writable because '.format(file_path) \
                                + 'it was locked by another process.')

    def labeller_from_json(self):
        file_path = self.path_to('es_linker', 'labeller.json')

        paths = self._gen_paths_es()
        source = pd.read_csv(paths['source'],
                             sep=',',
                             encoding='utf-8',
                             dtype=str,
                             nrows=3000)
        source = source.where(source.notnull(), '')

        ref_table_name = self.ref.project_id
        labeller = ESLabeller.from_json(file_path, es, source, ref_table_name)

        return labeller

    def analyze_results(self, params={}):
        # Check that memory is loaded (if necessary)
        self._check_mem_data()

        module_name = 'link_results_analyzer'

        # Initiate log
        log = self._init_active_log(module_name, 'infer')

        complete_metrics = defaultdict(int)

        for data in self.mem_data:
            metrics = link_results_analyzer(data, params)

            for col in ['num_match_thresh', 'num_match', 'num_verif_samples']:
                complete_metrics[col] += metrics[col]

            # Weigh ratios according to the number of samples (we divide after)
            complete_metrics['perc_match_thresh'] += metrics[
                'perc_match_thresh'] * metrics['num_match_thresh']
            complete_metrics[
                'perc_match'] += metrics['perc_match'] * metrics['num_match']
            complete_metrics['precision'] += metrics.get(
                'precision', 0) * metrics['num_verif_samples']

        if complete_metrics['num_match_thresh']:
            complete_metrics['perc_match_thresh'] /= complete_metrics[
                'num_match_thresh']

        if complete_metrics['num_match']:
            complete_metrics['perc_match'] /= complete_metrics['num_match']

        if complete_metrics['precision']:
            complete_metrics['precision'] /= complete_metrics[
                'num_verif_samples']

        # Write result of inference
        module_to_write_to = self.MODULES['infer'][module_name]['write_to']

        self.upload_config_data(complete_metrics, module_to_write_to,
                                'infered_config.json')

        # Update log buffer
        self._end_active_log(log, error=False)

        return complete_metrics

    #==========================================================================
    #  Elasticsearch
    #==========================================================================

    def update_results(self, labels):
        '''Updates the merged table in Elasticsearch to take into account the
        new labels.
        '''
        # TODO: source indices

        new_rows = []
        columns = set()
        for label in labels:
            current_row = es.get(self.index_name, 'structure',
                                 label['source_id'])['_source']
            if label['is_match']:
                if current_row['__ID_REF'] != label['ref_id']:
                    new_ref = es.get(self.ref.project_id, 'structure',
                                     label['ref_id'])['_source']
                    new_ref = {
                        key + '__REF': val
                        for key, val in new_ref.items()
                    }
                    new_row = {key: val for key, val in current_row.items()}
                    new_row.update(new_ref)
                    new_row['__IS_MATCH'] = True
                    new_row['__CONFIDENCE'] = 999
                    new_row['__ID_REF'] = label['ref_id']

                    # TODO: what to do with __ES_SCORE, __ID_QUERY, __THRESH
                else:
                    new_row = {key: val for key, val in current_row.items()}
                    new_row['__IS_MATCH'] = True
                    new_row['__CONFIDENCE'] = 999
            else:
                new_row = {col: val for col, val in current_row.items()}

                nan_cols = list(filter(lambda x: x[-5:]=='__REF', new_row.keys())) \
                            + ['__CONFIDENCE', '__ES_SCORE', '__ID_QUERY', \
                               '__ID_REF', '__IS_MATCH', '__THRESH']

                for col in nan_cols:
                    new_row[col] = np.nan

            columns.update(new_row.keys())
            new_rows.append((label['source_id'], new_row))

        if new_rows:
            dtype = {col: self._choose_dtype(col) for col in columns}
            tab = pd.DataFrame([x[1] for x in new_rows],
                               index=[x[0] for x in new_rows])

            # Fix for dtype that is not working in DataFrame call
            for k, v in dtype.items():
                if v == str:
                    tab[k].fillna('', inplace=True)
                tab[k] = tab[k].astype(v)

            ref_gen = (x for x in [tab])
            self.update_index(ref_gen)

        # Dirty method to keep track of modifications
        file_name = self.metadata['log'].keys()
        assert len(file_name) == 1
        file_name = list(file_name)[0]
        self.metadata['log'][file_name]['upload_es_train'][
            'was_modified'] = True
        self._write_metadata()
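
    # Shape of the `labels` argument expected by update_results (ids are
    # placeholders): one dict per labelled source row.
    #     labels = [
    #         {'source_id': 'doc_00042', 'ref_id': 'ref_00017', 'is_match': True},
    #         {'source_id': 'doc_00043', 'ref_id': 'ref_00099', 'is_match': False},
    #     ]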
Example #8
    def add_selected_project(self, file_role, public, project_id):
        '''
        Select the project to use as source or referential.

        INPUT:
            - file_role: "source" or "ref"
            - public: (bool) whether the project is available to all (as opposed to a user project)
            - project_id: ID of the normalization project to select
        '''
        self._check_file_role(file_role)
        # Check that file exists
        if public:
            raise DeprecationWarning
        else:
            proj = ESNormalizer(project_id)

        #        if file_name not in proj.metadata['files']:
        #            raise Exception('File {0} could not be found in project {1} \
        #                 (public: {2})'.format(file_name, project_id, public))

        # Check that normalization project has only one file (and possibly a MINI__ version)
        if not len(proj.metadata['files']):
            raise Exception(
                'The selected normalization project ({0}) has no uploaded file'.
                format(project_id))
        if len(proj.metadata['files']) > 1:
            raise Exception(('The selected normalization project ({0}) has more than one file.'
                             ' This method expects projects to have exactly 1 file as it'
                             ' uses the implicit get_last_written').format(project_id))

        # TODO: last written is a bad idea because if we modify normalization then BOOM !
        # TODO: last_written between concat_with_init and init ?
        (module_name, file_name) = proj.get_last_written()

        # TODO: add warning for implicit use of not-MINI
        if proj.metadata['has_mini']:
            file_name = file_name.replace('MINI__', '')

        # Record the selected file for this role
        self.metadata['files'][file_role] = {
            'public': public,
            'project_id': project_id,
            'module_name': module_name,
            'file_name': file_name,
            'restricted': False
        }

        # Create log for source
        if file_role == 'source':
            self.metadata['log'][self.output_file_name(
                file_name)] = self._default_log()

        # Add project selection
        if (self.metadata['files']['source']
                is not None) and (self.metadata['files']['ref'] is not None):
            for file_name in self.metadata['log']:
                self.metadata['log'][file_name]['INIT']['completed'] = True
        self._write_metadata()
        self.load_project_to_merge(file_role)
Example #9
class ESLinker(Linker):
    def path_to(self, module_name='', file_name=''):
        return self._path_to(LINK_DATA_PATH, module_name, file_name)


if __name__ == '__main__':

    assert False

    source_file_name = 'source.csv'
    source_user_given_name = 'my_source.csv'
    ref_file_name = 'ref.csv'

    # Create source
    proj = ESNormalizer(None, create_new=True)
    source_proj_id = proj.project_id

    # Upload files to normalize
    file_path = os.path.join('local_test_data', source_file_name)
    with open(file_path, 'rb') as f:
        proj.upload_init_data(f, source_file_name, source_user_given_name)

    # Create ref
    proj = ESNormalizer(None, create_new=True)
    ref_proj_id = proj.project_id

    # Upload files to normalize
    file_path = os.path.join('local_test_data', ref_file_name)
    with open(file_path, 'rb') as f:
        proj.upload_init_data(f, ref_file_name, ref_file_name)
Example #10
class Linker(ESAbstractDataProject):
    MODULES = LINK_MODULES
    MODULE_ORDER = LINK_MODULE_ORDER
    MODULE_ORDER_log = LINK_MODULE_ORDER_log

    def __init__(self,
                 project_id=None,
                 create_new=False,
                 display_name=None,
                 description=None,
                 public=False):

        super().__init__(project_id,
                         create_new,
                         display_name=display_name,
                         description=description,
                         public=public)

        self.source = None
        self.ref = None

        # Add source and ref if they were selected
        if (self.metadata['files']['source'] is not None) \
            and (self.metadata['files']['ref'] is not None):
            self.load_project_to_merge('source')
            self.load_project_to_merge('ref')

    def __repr__(self):
        string = '{0}({1})'.format(self.__class__.__name__, self.project_id)

        string += ' / source: '
        if self.source is not None:
            string += self.source.__repr__()
        else:
            string += 'None'

        string += ' / ref: '
        if self.ref is not None:
            string += self.ref.__repr__()
        else:
            string += 'None'
        return string

    def __str__(self):
        string = '{0}; project_id:{1}'.format(self.__class__.__name__,
                                              self.project_id)
        if self.source is not None:
            string += '\n\n***SOURCE***\n{0}'.format(self.source.__str__())
        if self.ref is not None:
            string += '\n\n***REF***\n{0}'.format(self.ref.__str__())
        return string

    @staticmethod
    def output_file_name(source_file_name):
        '''Name of the file to output'''
        return source_file_name

    def load_project_to_merge(self, file_role):
        '''Uses the "current" field in metadata to load source or ref'''
        self._check_file_role(file_role)
        # TODO: Add safeguard somewhere
        # Add source

        if file_role == 'source':
            try:
                self.source = ESNormalizer(
                    self.metadata['files']['source']['project_id'])
            except Exception:
                self.source = None

        if file_role == 'ref':
            try:
                self.ref = ESNormalizer(
                    self.metadata['files']['ref']['project_id'])
            except Exception:
                self.ref = None
            #raise Exception('Normalizer project with id {0} could not be found'.format(project_id))

    @staticmethod
    def _check_file_role(file_role):
        if file_role not in ['ref', 'source']:
            raise Exception('file_role should be either "source" or "ref"')

    def _check_select(self):
        '''Check that a source and referential were selected'''
        for file_role in ['source', 'ref']:
            if self.metadata['files'][file_role] is None:
                raise Exception(
                    '{0} is not defined for this linking project'.format(
                        file_role))

    def _create_metadata(self,
                         description=None,
                         display_name=None,
                         public=False):
        metadata = super()._create_metadata(description=description,
                                            display_name=display_name,
                                            public=public)
        metadata['files'] = {
            'source': None,
            'ref': None
        }  # e.g. {'source': {'public': False, 'project_id': 'ABC123', 'file_name': 'source.csv'}, 'ref': None}
        metadata['project_type'] = 'link'
        return metadata

    def add_col_matches(self, column_matches):
        '''
        Adds a configuration file with the column matches between source and
        referential.
        
        INPUT:
            - column_matches: list of dicts (parsed JSON) pairing source and
              referential columns
        '''
        # TODO: add checks on file
        if (self.source is None) or (self.ref is None):
            raise RuntimeError(
                'Source or referential is not loaded; call add_selected_project '
                'and/or load_project_to_merge first.')

        # Add matches
        self.upload_config_data(column_matches, 'es_linker',
                                'column_matches.json')

        # Select these columns for normalization in source and ref

        # TODO: this will cover add_certain_col_matches
        # Add to log
        for file_name in self.metadata['log']:
            self.metadata['log'][file_name]['add_selected_columns'][
                'completed'] = True
        self._write_metadata()

    def add_es_learned_settings(self, learned_settings):
        '''Adds the learned es configuration'''

        print('trying to upload', learned_settings)

        self.upload_config_data(learned_settings, 'es_linker',
                                'learned_settings.json')

        for file_name in self.metadata['log']:
            self.metadata['log'][file_name]['upload_es_train'][
                'completed'] = True
        self._write_metadata()

    def read_col_matches(self, add_created=True):
        '''
        Read the column_matches config file and interpret the columns, looking
        for processed (normalized) columns.
        '''
        config = self.read_config_data('es_linker', 'column_matches.json')

        if not config:
            config = []

        return config

    def add_col_certain_matches(self, column_matches):
        '''column_matches is a parsed JSON list of dicts of lists'''
        # TODO: add checks on file
        self.upload_config_data(column_matches, 'es_linker',
                                'column_certain_matches.json')

    def read_col_certain_matches(self):
        config = self.read_config_data('es_linker',
                                       'column_certain_matches.json')
        if not config:
            config = []
        return config

    def add_cols_to_return(self, file_role, columns):
        '''
        columns is a list of columns in the referential that we want to 
        return during download
        '''
        # Check that both projects are finished
        for role in ['source', 'ref']:
            file_name = self.metadata['files'][role]['file_name']
            if not self.__dict__[role].metadata['complete'][file_name]:
                raise Exception('Cannot select columns: complete {0} project \
                                ({1}) before...'.format(
                    role, self.__dict__[role].project_id))

        # Write columns to return to config
        config_file_name = 'columns_to_return_{0}.json'.format(file_role)
        self.upload_config_data(columns, 'es_linker', config_file_name)

    def read_cols_to_return(self, file_role):
        config_file_name = 'columns_to_return_{0}.json'.format(file_role)
        config = self.read_config_data('es_linker', config_file_name)
        if not config:
            config = []
        return config

    def add_selected_project(self, file_role, public, project_id):
        '''
        Select the project to use as source or referential.

        INPUT:
            - file_role: "source" or "ref"
            - public: (bool) whether the project is available to all (as opposed to a user project)
            - project_id: ID of the normalization project to select
        '''
        self._check_file_role(file_role)
        # Check that file exists
        if public:
            raise DeprecationWarning
        else:
            proj = ESNormalizer(project_id)

        #        if file_name not in proj.metadata['files']:
        #            raise Exception('File {0} could not be found in project {1} \
        #                 (public: {2})'.format(file_name, project_id, public))

        # Check that normalization project has only one file (and possibly a MINI__ version)
        if not len(proj.metadata['files']):
            raise Exception(
                'The selected normalization project ({0}) has no uploaded file'.
                format(project_id))
        if len(proj.metadata['files']) > 1:
            raise Exception(('The selected normalization project ({0}) has more than one file.'
                             ' This method expects projects to have exactly 1 file as it'
                             ' uses the implicit get_last_written').format(project_id))

        # TODO: last written is a bad idea because if we modify normalization then BOOM !
        # TODO: last_written between concat_with_init and init ?
        (module_name, file_name) = proj.get_last_written()

        # TODO: add warning for implicit use of not-MINI
        if proj.metadata['has_mini']:
            file_name = file_name.replace('MINI__', '')

        # Record the selected file for this role
        self.metadata['files'][file_role] = {
            'public': public,
            'project_id': project_id,
            'module_name': module_name,
            'file_name': file_name,
            'restricted': False
        }

        # Create log for source
        if file_role == 'source':
            self.metadata['log'][self.output_file_name(
                file_name)] = self._default_log()

        # Add project selection
        if (self.metadata['files']['source']
                is not None) and (self.metadata['files']['ref'] is not None):
            for file_name in self.metadata['log']:
                self.metadata['log'][file_name]['INIT']['completed'] = True
        self._write_metadata()
        self.load_project_to_merge(file_role)

    def read_selected_files(self):
        '''
        Returns self.metadata['files']
        '''
        return self.metadata['files']

    def infer(self, module_name, params):
        '''Overwrite to allow restrict_reference'''
        if module_name == 'infer_restriction':
            params['NO_MEM_DATA'] = True
        return super().infer(module_name, params)

    def linker(self, module_name, data_params, module_params):
        '''Wrapper around link methods.'''
        if module_name == 'es_linker':
            return self.es_linker(module_params)
        elif module_name == 'dedupe_linker':
            raise DeprecationWarning

    def es_linker(self, module_params):
        module_params['index_name'] = ESNormalizer(
            self.ref.project_id).index_name

        self.source.load_data(*self.source.get_last_written())
        self.mem_data = self.source.mem_data
        self.mem_data_info = self.source.mem_data_info

        # Change file_name to output file_name
        self.mem_data_info['file_name'] = self.output_file_name(
            self.mem_data_info['file_name'])  # File being modified

        log, run_info = self.transform('es_linker', module_params)

        return log, run_info

    def write_labeller(self, module_name, labeller):
        '''Pickles the labeller object in project'''
        # TODO: Add isinstance(labeller, Labeller)
        pickle_path = self.path_to(module_name, 'labeller.pkl')

        labeller.to_pickle(pickle_path)

    def _read_labeller(self, module_name):
        '''Reads labeller stored in pickle'''
        pickle_path = self.path_to(module_name, 'labeller.pkl')

        labeller = ESLabeller.from_pickle(pickle_path, es)
        return labeller

    #==========================================================================
    #  Module specific: ES Linker
    #==========================================================================

    def _gen_paths_es(self):
        self._check_select()

        # Get path to training file for ES linker
        training_path = self.path_to('es_linker', 'training.json')
        learned_settings_path = self.path_to('es_linker',
                                             'learned_settings.json')

        # TODO: check that normalization projects are complete ?

        # Get path to source
        # TODO: fix this: use current
        file_name = self.metadata['files']['source']['file_name']
        source_path = self.source.path_to_last_written(module_name=None,
                                                       file_name=file_name)

        # Add paths
        paths = {
            'source': source_path,
            'train': training_path,
            'learned_settings': learned_settings_path
        }
        return paths

    @staticmethod
    def _tuple_or_string(x):
        if isinstance(x, str):
            return x
        elif isinstance(x, list):
            if len(x) == 1:
                return x[0]
            else:
                return tuple(x)
        elif isinstance(x, tuple):
            if len(x) == 1:
                return x[0]
            else:
                return x
        else:
            raise ValueError('Value should be str, list or tuple')

    def _gen_es_labeller(self,
                         columns_to_index=None,
                         certain_column_matches=None):
        '''
        Return an ESLabeller object.
        '''
        self._check_select()

        #chunksize = 40000

        col_matches_tmp = self.read_col_matches()
        col_matches = []
        for match in col_matches_tmp:
            col_matches.append({
                'source': self._tuple_or_string(match['source']),
                'ref': self._tuple_or_string(match['ref'])
            })
        # TODO: lists to tuple in col_matches

        paths = self._gen_paths_es()
        source = pd.read_csv(paths['source'],
                             sep=',',
                             encoding='utf-8',
                             dtype=str,
                             nrows=3000)
        source = source.where(source.notnull(), '')

        ref_table_name = self.ref.project_id
        if columns_to_index is None:
            columns_to_index = self.ref.gen_default_columns_to_index()

        labeller = ESLabeller(es, source, ref_table_name, col_matches,
                              columns_to_index, certain_column_matches)

        # TODO: Auto label certain pairs

        # TODO: Add pre-load for 3 first queries

        return labeller

    def labeller_to_json(self, labeller):

        file_path = self.path_to('es_linker', 'labeller.json')
        labeller.to_json(file_path)

    def labeller_from_json(self):
        file_path = self.path_to('es_linker', 'labeller.json')

        paths = self._gen_paths_es()
        source = pd.read_csv(paths['source'],
                             sep=',',
                             encoding='utf-8',
                             dtype=str,
                             nrows=3000)
        source = source.where(source.notnull(), '')

        ref_table_name = self.ref.project_id
        labeller = ESLabeller.from_json(file_path, es, source, ref_table_name)

        return labeller

    def analyze_results(self, params={}):
        # Check that memory is loaded (if necessary)
        self._check_mem_data()

        module_name = 'link_results_analyzer'

        # Initiate log
        log = self._init_active_log(module_name, 'infer')

        agg_results = defaultdict(int)
        for data in self.mem_data:
            infered_params = link_results_analyzer(data, params)

            agg_results['num_match'] += infered_params['num_match']
            agg_results['num_match_thresh'] += infered_params[
                'num_match_thresh']

        # Write result of inference
        module_to_write_to = self.MODULES['infer'][module_name]['write_to']

        self.upload_config_data(agg_results, module_to_write_to,
                                'infered_config.json')

        # Update log buffer
        self._end_active_log(log, error=False)

        return agg_results