Example #1
    def save(self):
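        """
        Generates the linked data and matched-but-not-linked output files.
        :return: Linking summary report.
        """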
        logger.debug('>>--- save --->>')

        left_index = 'LEFT_' + self.left_index
        right_index = 'RIGHT_' + self.right_index
        left_entity_id = 'LEFT_' + self.left_entity
        right_entity_id = 'RIGHT_' + self.right_entity
        grouped = self.matched_not_linked.reset_index().groupby(
            [left_index, right_index, left_entity_id,
             right_entity_id]).agg({'STEP': 'min'})

        self.matched_not_linked = pd.DataFrame(grouped)

        # Storing linked data records.
        logger.info(
            "Preparing output files of the linking project %s with tsk id %s.",
            self.project['name'], self.project['task_uuid'])
        linked_file_path = self.project['output_root'] + link_config.get(
            'linked_data_file', 'linked_data.csv')

        self.linked['STEP'] = self.linked['STEP'].map(
            lambda x: '{:.0f}'.format(x) if pd.notnull(x) else np.nan)

        self.linked[right_entity_id] = self.linked[right_entity_id].map(
            lambda x: '{:.0f}'.format(x) if pd.notnull(x) else np.nan)

        self.linked[left_entity_id] = self.linked[left_entity_id].map(
            lambda x: '{:.0f}'.format(x) if pd.notnull(x) else np.nan)

        if np.issubdtype(self.left_index_type, np.integer):
            self.linked[left_index] = self.linked[left_index].map(
                lambda x: '{:.0f}'.format(x) if pd.notnull(x) else np.nan)

        if np.issubdtype(self.right_index_type, np.integer):
            self.linked[right_index] = self.linked[right_index].map(
                lambda x: '{:.0f}'.format(x) if pd.notnull(x) else np.nan)

        self.linked = self.linked.sort_values(['LINK_ID'])
        # replace() returns a new frame; reassign so the blanks are kept.
        self.linked = self.linked.replace(np.nan, '', regex=True)
        self.linked.to_csv(linked_file_path, index=False)

        # Storing matched but not linked records.
        matched_file_path = self.project['output_root'] + link_config.get(
            'matched_not_linked_filename', 'matched_not_linked_data.csv')
        self.matched_not_linked = self.matched_not_linked.replace(
            np.nan, '', regex=True)
        self.matched_not_linked.to_csv(matched_file_path)

        logger.info('Linking output files generated: %s,\n %s.',
                    linked_file_path, matched_file_path)

        # Clean all remaining temp files
        if os.path.exists(self.temp_path):
            shutil.rmtree(self.temp_path)

        logger.debug('<<--- save ---<<')
        return generate_linking_summary(self, self.project['output_root'])
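Note: pandas DataFrame.replace returns a new frame rather than mutating in place, so its result must be reassigned, as above. A minimal standalone sketch of the difference:

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({'A': [1.0, np.nan]})
    df.replace(np.nan, '', regex=True)       # result discarded; df unchanged
    df = df.replace(np.nan, '', regex=True)  # reassigned; NaN becomes ''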
Example #2
    def save(self):
        """
        Generates the de-duplicated file.
        :return:
        """
        logger.debug('>>--- save --->>')

        # Assign entity id to all remaining records.
        logger.info('Assigning entity id to all remaining records.')
        for rec_id in self.left_dataset.index.values:
            # .at replaces the deprecated DataFrame.set_value
            self.left_dataset.at[rec_id, 'ENTITY_ID'] = \
                MemoryLinkBase.get_next_id()

        # DataFrame.append was removed in pandas 2.0; concat is the replacement.
        output = pd.concat([self.linked, self.left_dataset])
        output = output.sort_values(['ENTITY_ID'])

        dataset = self.project['datasets'][0]

        try:
            usecols = dataset['columns'] or self.left_columns
        except KeyError:
            usecols = self.left_columns

        self.left_dataset = pd.read_csv(dataset['url'],
                                        index_col=dataset['index_field'],
                                        usecols=usecols,
                                        skipinitialspace=True,
                                        dtype=self.left_dtypes)

        result = pd.concat([self.left_dataset, output['ENTITY_ID']],
                           axis=1,
                           join='inner')
        cols = result.columns.tolist()
        cols.insert(0, cols.pop(cols.index('ENTITY_ID')))
        result = result[cols]

        self.total_entities = len(output.groupby(['ENTITY_ID']))

        logger.info('Total number of entities after de-duplication: %s',
                    self.total_entities)
        # Storing deduplication result. It contains the original records plus the entity id of each record.
        deduped_file_path = self.project['output_root'] + link_config.get(
            'deduped_data_file', 'deduped_data.csv')

        result['ENTITY_ID'] = result['ENTITY_ID'].map(
            lambda x: '{:.0f}'.format(x) if pd.notnull(x) else np.nan)

        result = result.replace(np.nan, '', regex=True)
        result.to_csv(deduped_file_path,
                      index_label=dataset['index_field'],
                      header=True,
                      index=True)
        logger.info('De-duplicated file generated at %s.', deduped_file_path)

        # Clean all remaining temp files
        if os.path.exists(self.temp_path):
            shutil.rmtree(self.temp_path)

        logger.debug('<<--- save ---<<')
        return generate_linking_summary(self, self.project['output_root'])
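Note: DataFrame.set_value was deprecated in pandas 0.21 and removed in 1.0; the .at accessor used above is its replacement for scalar, label-based assignment. A minimal sketch:

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({'ENTITY_ID': [np.nan] * 3})
    for rec_id in df.index.values:
        df.at[rec_id, 'ENTITY_ID'] = rec_id + 100  # scalar set by label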
Example #3
    def save(self):
        """
        Create the de-duplicated output file sorted by entity id's and
        Generates the de-duplicated file.
        Preconditions: All de-duplication steps must be completed.
        :return: De-duplication summary report.
        """
        logger.debug('>>--- save --->>')
        logger.info('Saving results of the de-duplication project %s-%s',
                    self.project['name'], self.project['task_uuid'])
        # Adding the selected (de-duped) entities to the final result
        selected_rows = self.temp_path + LinkFiles.TEMP_DEDUP_ALL_SELECTED
        data_reader = pd.read_csv(self.left_file,
                                  usecols=self.left_columns,
                                  dtype=self.left_dtypes,
                                  skipinitialspace=True,
                                  chunksize=CHUNK_SIZE)

        # Storing deduplication result.
        # It contains the original records plus the entity id of each record.
        deduped_file_path = self.output_root + link_config.get(
            'deduped_data_file', 'deduped_data.csv')

        file_mode = 'a'
        header = False
        if os.path.isfile(selected_rows):
            os.rename(selected_rows, deduped_file_path)
        else:
            file_mode = 'w'
            header = True

        # Assign unique entity id to all remaining records.
        logger.info('Assigning entity id to all remaining records.')
        total_remained = 0
        with open(deduped_file_path, file_mode) as out_file:
            for chunk in data_reader:
                chunk.insert(0, 'ENTITY_ID', np.nan)
                for rec_id in chunk.index.values:
                    # .at replaces the deprecated DataFrame.set_value
                    chunk.at[rec_id, 'ENTITY_ID'] = \
                        ChunkedLinkBase.get_next_id()
                    total_remained += 1
                chunk = chunk.replace(np.nan, '', regex=True)
                chunk.to_csv(out_file, index=False, header=header)
                header = False

        # Total number of entities after de-duplication
        self.total_entities += total_remained
        logger.info('Total number of entities after de-duplication: %s',
                    self.total_entities)

        # Clean all remaining temp files
        if os.path.exists(self.temp_path):
            shutil.rmtree(self.temp_path)

        logger.info('De-duplicated file generated at %s.', deduped_file_path)
        logger.debug('<<--- save ---<<')
        # Generating de-duplication summary report
        return generate_linking_summary(self, self.output_root)
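Note: the loop above follows the standard pandas pattern for streaming a large CSV: read_csv with chunksize yields an iterator of frames, and only the first chunk writes the header row. A self-contained sketch with hypothetical file names:

    import pandas as pd

    reader = pd.read_csv('big_input.csv', chunksize=100000)  # hypothetical input
    header = True
    with open('big_output.csv', 'w') as out_file:
        for chunk in reader:
            chunk.to_csv(out_file, index=False, header=header)
            header = False  # column names are written once, by the first chunk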
Example #4
    def save(self):
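        """
        Generates the linked data output file and restores the sorted
        left and right input files.
        :return: Linking summary report.
        """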
        logger.debug('>>--- save --->>')
        logger.info(
            "Preparing output file of the linking project %s with tsk id %s.",
            self.project['name'], self.project['task_uuid'])

        linked_file_path = self.output_root + link_config.get(
            'linked_data_file', 'linked_data.csv')

        linked_filename = self.temp_path + LinkFiles.TEMP_LINKED_RECORDS
        temp_sorted_file = self.temp_path + LinkFiles.TEMP_SORTED_FILE

        if self.total_records_linked > 0:
            sort_csv(linked_filename,
                     appendfile=temp_sorted_file,
                     cols=['LINK_ID'],
                     types={'LINK_ID': 'numeric'},
                     work_dir=self.temp_path)
            if os.path.isfile(temp_sorted_file):
                os.rename(temp_sorted_file, linked_file_path)

        if os.path.isfile(linked_filename):
            os.remove(linked_filename)

        sort_csv(self.left_file,
                 appendfile=temp_sorted_file,
                 cols=[self.left_index],
                 types={self.left_index: 'numeric'},
                 work_dir=self.temp_path)

        if os.path.isfile(self.left_file):
            os.remove(self.left_file)
        if os.path.isfile(temp_sorted_file):
            os.rename(temp_sorted_file, self.left_file)

        sort_csv(self.right_file,
                 appendfile=temp_sorted_file,
                 cols=[self.right_index],
                 types={self.right_index: 'numeric'},
                 work_dir=self.temp_path)

        if os.path.isfile(self.right_file):
            os.remove(self.right_file)
        if os.path.isfile(temp_sorted_file):
            os.rename(temp_sorted_file, self.right_file)

        # Clean all remaining temp files
        if os.path.exists(self.temp_path):
            shutil.rmtree(self.temp_path)

        logger.info('Linking output file generated at %s.', linked_file_path)
        logger.debug('<<--- save ---<<')

        return generate_linking_summary(self, self.output_root)
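Note: sort_csv here is project-specific, but the surrounding write-to-temp-then-rename pattern is a general way to replace a file only after its new version is complete. A stdlib-only sketch of that swap, with a plain line sort standing in for sort_csv:

    import os

    def replace_with_sorted(path, temp_path):
        """Sort a text file line-wise, then swap the sorted copy into place."""
        with open(path) as src, open(temp_path, 'w') as dst:
            dst.writelines(sorted(src))  # stand-in for sort_csv
        os.remove(path)                  # drop the unsorted original
        os.rename(temp_path, path)       # sorted copy takes its place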
Example #5
    def save_linked_data(self, data, append=False):
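        """
        Writes the matched (de-dup) records to the output file, appending
        when requested.
        """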
        logger.debug('>>--- save_linked_data --->>')
        file_path = self.project['output_root'] + link_config.get(
            'dedup_matched_file', 'dedup_matched.csv')

        data = data.replace(np.nan, '', regex=True)
        if not append:
            data.to_csv(file_path)
        else:
            with open(file_path, 'a') as f:
                data.to_csv(f, header=False)

        logger.debug('<<--- save_linked_data ---<<')
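Note: to_csv can also append directly via mode='a', which is equivalent to the explicit file handle above. A minimal sketch (save_csv is a hypothetical name):

    import pandas as pd

    def save_csv(data, file_path, append=False):
        # mode='a' with header=False matches the open(file_path, 'a') branch
        if append:
            data.to_csv(file_path, mode='a', header=False)
        else:
            data.to_csv(file_path)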
Example #6
    def load_data(self):
        logger.debug('>>--- load_data --->>')
        logger.info('Loading input dataset for project: %s with task id: %s.',
                    self.project['name'], self.project['task_uuid'])

        dataset = self.project['datasets'][0]
        self.left_columns.append(dataset['index_field'])

        if 'data_types' in dataset:
            left_dtypes = {}
            for col_name, col_type in dataset["data_types"].items():
                left_dtypes[col_name] = COLUMN_TYPES[col_type]
        else:
            left_dtypes = None

        try:
            usecols = dataset['columns'] or self.left_columns
        except KeyError:
            usecols = self.left_columns

        self.left_dtypes = self.right_dtypes = left_dtypes
        self.left_columns = self.right_columns = usecols

        logger.debug('Data columns: %s.', self.left_columns)
        logger.debug('Data types: %s', self.left_dtypes)

        self.right_file = self.left_file = \
            self.output_root + link_config.get('left_file', 'left_file.csv')

        super(ChunkedDedup, self).import_data(dataset['url'],
                                              usecols,
                                              self.left_file,
                                              front_cols=[self.left_index],
                                              data_types=self.left_dtypes)

        logger.debug('<<--- load_data ---<<')
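Note: the dtype-building branch above can be condensed into a dict comprehension; this sketch reuses the dataset and COLUMN_TYPES names from the example:

    left_dtypes = ({col: COLUMN_TYPES[col_type]
                    for col, col_type in dataset['data_types'].items()}
                   if 'data_types' in dataset else None)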
Example #7
    def run(self):
        """
        Runs a de-duplication project consisting of a sequence of steps.
        Each step is defined by a set of blocking and linking identifiers and rules.
        :return: A de-duplicated version of the original data file and the de-duplication summary report.
        """
        logger.debug('>>--- run --->>')
        logger.info('Executing de-duplication project %s. Task id: %s.',
                    self.project['name'], self.project['task_uuid'])

        ChunkedLinkBase.reset_id()
        self.steps = {}
        self.linked = pd.DataFrame()

        matched_file = self.temp_path + LinkFiles.MATCHED_RECORDS

        selected_filename = self.temp_path + LinkFiles.TEMP_DEDUP_STEP_SELECTED
        final_selected_file = self.temp_path + LinkFiles.TEMP_DEDUP_ALL_SELECTED

        dedup_results_file = self.output_root + link_config.get(
            'dedup_matched_file', 'dedup_matched.csv')

        linked_file = self.temp_path + LinkFiles.TEMP_ENTITIES_FILE

        open(matched_file, 'w').close()

        linked_stats = {}
        prev_total = 0
        self.total_entities = 0
        first_batch = True
        for step in self.project['steps']:
            self.steps[step['seq']] = {}
            logger.info("De-duplication Step %s :", step['seq'])
            logger.info(
                "%s.1) Finding record pairs satisfying blocking and linking constraints...",
                step['seq'])

            pairs_count = self.pair_n_match(step=step['seq'],
                                            link_method=step['linking_method'],
                                            blocking=step['blocking_schema'],
                                            linking=step['linking_schema'],
                                            matched_file=matched_file)

            # pair_n_match returns 0 when a step finds no new pairs; carry
            # the previous total forward so earlier results are still
            # merged and counted.
            if pairs_count == 0:
                pairs_count = prev_total

            linked_stats[step['seq']] = pairs_count - prev_total
            prev_total = pairs_count

            logger.debug('Total records matched at step %s: %s', step['seq'],
                         linked_stats[step['seq']])

            # Group and link only if this step requires grouping and some
            # records matched.
            if step['group'] and pairs_count > 0:
                step_total_entities = self.link_pairs()
                logger.debug('Total entities found at step %s: %s',
                             step['seq'], step_total_entities)

                self.total_entities += step_total_entities

                self.extract_rows(data_filename=self.left_file,
                                  data_id=self.left_index,
                                  index_filename=linked_file,
                                  index_id='REC_ID',
                                  index_cols=['ENTITY_ID'])

                sort_csv(selected_filename,
                         appendfile=final_selected_file,
                         cols=['ENTITY_ID'],
                         types={'ENTITY_ID': 'numeric'},
                         work_dir=self.temp_path)

                self.total_records_linked += pairs_count

                ChunkedLinkBase.append_rows(dedup_results_file,
                                            matched_file,
                                            first_batch=first_batch)
                first_batch = False
                open(matched_file, 'w').close()
                prev_total = 0

        for step in self.project['steps']:
            self.steps[step['seq']]['total_records_linked'] = linked_stats.get(
                step['seq'], 0)

        if os.path.isfile(matched_file):
            os.remove(matched_file)
        if os.path.isfile(linked_file):
            os.remove(linked_file)
        if os.path.isfile(selected_filename):
            os.remove(selected_filename)

        logger.info(
            'Execution of de-duplication project %s with Task id: %s completed.',
            self.project['name'], self.project['task_uuid'])
        logger.debug('<<--- run ---<<')
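Note: run() assumes each entry in project['steps'] carries the keys read above. A hypothetical fragment illustrating that shape (all values are made up):

    project_steps = [
        {
            'seq': 1,
            'linking_method': 'exact',   # hypothetical method name
            'blocking_schema': {'left': ['BIRTH_YEAR']},
            'linking_schema': {'left': ['GIVEN_NAME', 'SURNAME']},
            'group': True,               # group matched pairs into entities
        },
    ]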
Example #8
    def load_data(self):
        logger.debug('>>--- load_data --->>')
        logger.info('Loading input datasets for project: %s with task id: %s.',
                    self.project['name'], self.project['task_uuid'])

        left_data = self.project['datasets'][0]
        self.left_columns.append(left_data['index_field'])
        self.left_columns.append(left_data['entity_field'])

        if 'data_types' in left_data:
            left_dtypes = {}
            for col_name, col_type in left_data["data_types"].items():
                left_dtypes[col_name] = COLUMN_TYPES[col_type]
        else:
            left_dtypes = None

        try:
            usecols = left_data['columns'] or self.left_columns
        except KeyError:
            usecols = self.left_columns

        self.left_dtypes = left_dtypes
        self.left_columns = usecols

        logger.debug('Left data columns: %s.', self.left_columns)
        logger.debug('Left data types: %s', self.left_dtypes)

        self.left_file = self.output_root + \
            link_config.get('left_file', 'left_file.csv')

        super(ChunkedLink,
              self).import_data(left_data['url'],
                                columns=usecols,
                                dest_filename=self.left_file,
                                front_cols=[self.left_index, self.left_entity],
                                data_types=self.left_dtypes)

        right_data = self.project['datasets'][1]
        self.right_columns.append(right_data['index_field'])
        self.right_columns.append(right_data['entity_field'])

        if 'data_types' in right_data:
            right_dtypes = {}
            for col_name, col_type in right_data["data_types"].items():
                right_dtypes[col_name] = COLUMN_TYPES[col_type]
        else:
            right_dtypes = None

        try:
            usecols = right_data['columns'] or self.right_columns
        except KeyError:
            usecols = self.right_columns

        self.right_dtypes = right_dtypes
        self.right_columns = usecols

        logger.debug('Right data columns: %s.', self.right_columns)
        logger.debug('Right data types: %s', self.right_dtypes)

        self.right_file = self.output_root + \
            link_config.get('right_file', 'right_file.csv')

        super(ChunkedLink, self).import_data(
            right_data['url'],
            usecols,
            self.right_file,
            front_cols=[self.right_index, self.right_entity],
            data_types=self.right_dtypes)

        logger.debug('<<--- load_data ---<<')
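Note: the try/except around ['columns'] (used for both sides above) can be expressed with dict.get, since `or` already covers an empty value. A standalone sketch with a hypothetical helper name:

    def pick_usecols(dataset, default_columns):
        # dict.get covers a missing 'columns' key; `or` covers an empty value
        return dataset.get('columns') or default_columns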
Example #9
    def run(self):
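        """
        Runs a linking project consisting of a sequence of steps.
        Each step is defined by a set of blocking and linking identifiers
        and rules.
        """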
        logger.debug('>>--- run --->>')
        logger.info('Executing linking project %s. Task id: %s.',
                    self.project['name'], self.project['task_uuid'])

        ChunkedLinkBase.reset_id()
        self.steps = {}
        linked_stats = {}
        self.total_entities = 0
        self.total_records_linked = 0

        matched_file = self.temp_path + LinkFiles.MATCHED_RECORDS

        matched_not_linked_filename = self.output_root \
            + link_config.get('matched_not_linked_filename', 'matched_not_linked_data.csv')
        linked_filename = self.temp_path + LinkFiles.TEMP_LINKED_RECORDS
        step_linked = self.temp_path + LinkFiles.TEMP_STEP_LINKED_FILE
        temp_sorted_file = self.temp_path + LinkFiles.TEMP_SORTED_FILE

        open(linked_filename, 'w').close()
        open(matched_not_linked_filename, 'w').close()

        first_batch = True

        for step in self.project['steps']:
            # Sort input files based on ingestion id:
            sort_csv(self.left_file,
                     appendfile=temp_sorted_file,
                     cols=[self.left_index],
                     types={self.left_index: 'numeric'},
                     work_dir=self.temp_path)

            os.remove(self.left_file)
            os.rename(temp_sorted_file, self.left_file)
            sort_csv(self.right_file,
                     appendfile=temp_sorted_file,
                     cols=[self.right_index],
                     types={self.right_index: 'numeric'},
                     work_dir=self.temp_path)
            os.remove(self.right_file)
            os.rename(temp_sorted_file, self.right_file)

            self.steps[step['seq']] = {}
            logger.info("Linking Step %s :", step['seq'])
            logger.info(
                "%s.1) Finding record pairs satisfying blocking and linking constraints...",
                step['seq'])

            open(matched_file, 'w').close()
            pairs_count = self.pair_n_match(step=step['seq'],
                                            link_method=step['linking_method'],
                                            blocking=step['blocking_schema'],
                                            linking=step['linking_schema'],
                                            matched_file=matched_file)

            linked_stats[step['seq']] = pairs_count

            if pairs_count == 0:
                logger.info('No records matched at step %s', step['seq'])
                self.steps[step['seq']]['total_records_linked'] = 0
                self.steps[step['seq']]['total_matched_not_linked'] = 0
                self.steps[step['seq']]['total_entities'] = 0
                continue

            logger.info(
                "%s.3) Identifying the linked records based on the relationship type...",
                step['seq'])
            link_stats = self.link(self.project['relationship_type'])

            self.steps[step['seq']]['total_records_linked'] = link_stats[
                'total_records_linked']
            self.total_records_linked += link_stats['total_records_linked']
            self.steps[step['seq']]['total_matched_not_linked'] = link_stats[
                'total_filtered']
            self.steps[
                step['seq']]['total_entities'] = link_stats['total_linked']
            self.total_entities += self.steps[step['seq']]['total_entities']

            # Sort input files based on entity_id and ingestion id:
            sort_csv(self.left_file,
                     appendfile=temp_sorted_file,
                     cols=[self.left_entity, self.left_index],
                     types={
                         self.left_entity: 'numeric',
                         self.left_index: 'numeric'
                     },
                     work_dir=self.temp_path)
            os.remove(self.left_file)
            os.rename(temp_sorted_file, self.left_file)
            sort_csv(self.right_file,
                     appendfile=temp_sorted_file,
                     cols=[self.right_entity, self.right_index],
                     types={
                         self.right_entity: 'numeric',
                         self.right_index: 'numeric'
                     },
                     work_dir=self.temp_path)
            os.remove(self.right_file)
            os.rename(temp_sorted_file, self.right_file)

            self.extract_linked_records(linked_filename=step_linked,
                                        prefix='LEFT_')
            self.extract_linked_records(linked_filename=step_linked,
                                        prefix='RIGHT_')

            ChunkedLinkBase.append_rows(linked_filename,
                                        step_linked,
                                        first_batch=first_batch)
            first_batch = False

        if os.path.isfile(step_linked):
            os.remove(step_linked)
        if os.path.isfile(matched_file):
            os.remove(matched_file)

        logger.info(
            'Execution of linking project %s with Task id: %s is completed.',
            self.project['name'], self.project['task_uuid'])
        logger.debug('<<--- run ---<<')
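Note: the sort/remove/rename triple appears four times in run(); a small helper could factor it out, reusing the project-specific sort_csv seen above (sort_in_place is a hypothetical name):

    import os

    def sort_in_place(filename, cols, types, temp_sorted_file, work_dir):
        """Sort a CSV with sort_csv, then swap the sorted copy into place."""
        sort_csv(filename, appendfile=temp_sorted_file,
                 cols=cols, types=types, work_dir=work_dir)
        os.remove(filename)
        os.rename(temp_sorted_file, filename)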