def save(self):
    """
    Generates the linked data and matched-but-not-linked output files.
    :return: Linking summary report.
    """
    logger.debug('>>--- save --->>')

    left_index = 'LEFT_' + self.left_index
    right_index = 'RIGHT_' + self.right_index
    left_entity_id = 'LEFT_' + self.left_entity
    right_entity_id = 'RIGHT_' + self.right_entity

    grouped = self.matched_not_linked.reset_index().groupby(
        [left_index, right_index, left_entity_id, right_entity_id]).agg(
            {'STEP': 'min'})
    self.matched_not_linked = pd.DataFrame(grouped)

    # Storing linked data records.
    logger.info(
        "Preparing output files of the linking project %s with task id %s.",
        self.project['name'], self.project['task_uuid'])
    linked_file_path = self.project['output_root'] + link_config.get(
        'linked_data_file', 'linked_data.csv')

    self.linked['STEP'] = self.linked['STEP'].map(
        lambda x: '{:.0f}'.format(x) if pd.notnull(x) else np.nan)
    self.linked[right_entity_id] = self.linked[right_entity_id].map(
        lambda x: '{:.0f}'.format(x) if pd.notnull(x) else np.nan)
    self.linked[left_entity_id] = self.linked[left_entity_id].map(
        lambda x: '{:.0f}'.format(x) if pd.notnull(x) else np.nan)

    if np.issubdtype(self.left_index_type, np.integer):
        self.linked[left_index] = self.linked[left_index].map(
            lambda x: '{:.0f}'.format(x) if pd.notnull(x) else np.nan)
    if np.issubdtype(self.right_index_type, np.integer):
        self.linked[right_index] = self.linked[right_index].map(
            lambda x: '{:.0f}'.format(x) if pd.notnull(x) else np.nan)

    self.linked = self.linked.sort_values(['LINK_ID'])
    # replace() returns a new frame; assign it back so the blanks are kept.
    self.linked = self.linked.replace(np.nan, '', regex=True)
    self.linked.to_csv(linked_file_path, index=False)

    # Storing matched but not linked records.
    matched_file_path = self.project['output_root'] + link_config.get(
        'matched_not_linked_filename', 'matched_not_linked_data.csv')
    self.matched_not_linked = self.matched_not_linked.replace(
        np.nan, '', regex=True)
    self.matched_not_linked.to_csv(matched_file_path)

    logger.info('Linking output files generated: %s,\n %s.',
                linked_file_path, matched_file_path)

    # Clean all remaining temp files
    if os.path.exists(self.temp_path):
        shutil.rmtree(self.temp_path)

    logger.debug('<<--- save ---<<')
    return generate_linking_summary(self, self.project['output_root'])

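# Note on the '{:.0f}'.format(...) mapping above: pandas reads integer
# columns that contain missing values as floats, and the format call writes
# the ids back out without a trailing '.0'. A minimal, standalone
# illustration of that behaviour (not part of the linking code):
import pandas as pd

ids = pd.Series([12345.0, None, 678.0])
print(ids.map(lambda x: '{:.0f}'.format(x) if pd.notnull(x) else ''))
# -> '12345', '', '678' (no trailing '.0' on the ids)
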
def save(self): """ Generates the de-duplicated file. :return: """ logger.debug('>>--- save --->>') # Assign entity id to all remaining records. logger.info('Assigning entity id to all remaining records.') for rec_id in self.left_dataset.index.values: self.left_dataset.set_value(rec_id, 'ENTITY_ID', MemoryLinkBase.get_next_id()) output = self.linked.append(self.left_dataset) output = output.sort_values(['ENTITY_ID']) dataset = self.project['datasets'][0] try: usecols = dataset['columns'] or self.left_columns except KeyError: usecols = self.left_columns self.left_dataset = pd.read_csv(dataset['url'], index_col=dataset['index_field'], usecols=usecols, skipinitialspace=True, dtype=self.left_dtypes) result = pd.concat([self.left_dataset, output['ENTITY_ID']], axis=1, join='inner') cols = result.columns.tolist() cols.insert(0, cols.pop(cols.index('ENTITY_ID'))) result = result[cols] self.total_entities = len(output.groupby(['ENTITY_ID'])) logger.info('Total number of entities after de-duplication: %s', self.total_entities) # Storing deduplication result. It contains the original records plus the entity id of each record. deduped_file_path = self.project['output_root'] + link_config.get( 'deduped_data_file', 'deduped_data.csv') result['ENTITY_ID'] = result['ENTITY_ID'].map( lambda x: '{:.0f}'.format(x) if pd.notnull(x) else np.nan) result.replace(np.nan, '', regex=True) result.to_csv(deduped_file_path, index_label=dataset['index_field'], header=True, index=True) logger.info('De-duplicated file generated at %s.', deduped_file_path) # Clean all remaining temp files if os.path.exists(self.temp_path): shutil.rmtree(self.temp_path) logger.debug('<<--- save ---<<') return generate_linking_summary(self, self.project['output_root'])
def save(self): """ Create the de-duplicated output file sorted by entity id's and Generates the de-duplicated file. Preconditions: All de-duplication steps must be completed. :return: De-duplication summary report. """ logger.debug('>>--- save --->>') logger.info('Saving results of the de-duplication project %s-%s', self.project['name'], self.project['task_uuid']) # Adding the selected (de-duped) entities to the final result selected_rows = self.temp_path + LinkFiles.TEMP_DEDUP_ALL_SELECTED data_reader = pd.read_csv(self.left_file, usecols=self.left_columns, dtype=self.left_dtypes, skipinitialspace=True, chunksize=CHUNK_SIZE) # Storing deduplication result. # It contains the original records plus the entity id of each record. deduped_file_path = self.output_root + link_config.get( 'deduped_data_file', 'deduped_data.csv') file_mode = 'a' header = False if os.path.isfile(selected_rows): os.rename(selected_rows, deduped_file_path) else: file_mode = 'w' header = True # Assign unique entity id to all remaining records. logger.info('Assigning entity id to all remaining records.') total_remained = 0 with open(deduped_file_path, file_mode) as out_file: for chunk in data_reader: chunk.insert(0, 'ENTITY_ID', np.nan) for rec_id in chunk.index.values: chunk.set_value(rec_id, 'ENTITY_ID', ChunkedLinkBase.get_next_id()) total_remained += 1 chunk.replace(np.nan, '', regex=True) chunk.to_csv(out_file, index=False, header=header) header = False # Total number of entities after de-duplication self.total_entities += total_remained logger.info('Total number of entities after de-duplication: %s', self.total_entities) # Clean all remaining temp files if os.path.exists(self.temp_path): shutil.rmtree(self.temp_path) logger.info('De-duplicated file generated at %s.', deduped_file_path) logger.debug('<<--- save ---<<') # Generating de-duplication summary report return generate_linking_summary(self, self.output_root)
def save(self):
    """
    Generates the final linked data file and restores the sorted
    input files.
    :return: Linking summary report.
    """
    logger.debug('>>--- save --->>')
    logger.info(
        "Preparing output file of the linking project %s with task id %s.",
        self.project['name'], self.project['task_uuid'])

    linked_file_path = self.output_root + link_config.get(
        'linked_data_file', 'linked_data.csv')
    linked_filename = self.temp_path + LinkFiles.TEMP_LINKED_RECORDS
    temp_sorted_file = self.temp_path + LinkFiles.TEMP_SORTED_FILE

    if self.total_records_linked > 0:
        sort_csv(linked_filename,
                 appendfile=temp_sorted_file,
                 cols=['LINK_ID'],
                 types={'LINK_ID': 'numeric'},
                 work_dir=self.temp_path)
    if os.path.isfile(temp_sorted_file):
        os.rename(temp_sorted_file, linked_file_path)
    if os.path.isfile(linked_filename):
        os.remove(linked_filename)

    sort_csv(self.left_file,
             appendfile=temp_sorted_file,
             cols=[self.left_index],
             types={self.left_index: 'numeric'},
             work_dir=self.temp_path)
    if os.path.isfile(self.left_file):
        os.remove(self.left_file)
    if os.path.isfile(temp_sorted_file):
        os.rename(temp_sorted_file, self.left_file)

    sort_csv(self.right_file,
             appendfile=temp_sorted_file,
             cols=[self.right_index],
             types={self.right_index: 'numeric'},
             work_dir=self.temp_path)
    if os.path.isfile(self.right_file):
        os.remove(self.right_file)
    if os.path.isfile(temp_sorted_file):
        os.rename(temp_sorted_file, self.right_file)

    # Clean all remaining temp files
    if os.path.exists(self.temp_path):
        shutil.rmtree(self.temp_path)

    logger.info('Linking output file generated at %s.', linked_file_path)
    logger.debug('<<--- save ---<<')
    return generate_linking_summary(self, self.output_root)

def save_linked_data(self, data, append=False):
    """
    Writes (or appends) matched record pairs to the de-duplication
    matched-records output file.
    """
    logger.debug('>>--- save_linked_data --->>')

    file_path = self.project['output_root'] + link_config.get(
        'dedup_matched_file', 'dedup_matched.csv')
    # replace() returns a new frame; assign it back so the blanks are kept.
    data = data.replace(np.nan, '', regex=True)
    if not append:
        data.to_csv(file_path)
    else:
        with open(file_path, 'a') as f:
            data.to_csv(f, header=False)

    logger.debug('<<--- save_linked_data ---<<')

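# save_linked_data() follows the usual pandas chunk-append pattern: the
# first write creates the file with a header, later writes append without
# one. A self-contained sketch of that pattern (the file name and data are
# illustrative only):
import pandas as pd

chunks = [pd.DataFrame({'REC_ID': [1, 2]}), pd.DataFrame({'REC_ID': [3]})]
for i, chunk in enumerate(chunks):
    if i == 0:
        chunk.to_csv('example_out.csv', index=False)       # create file with header
    else:
        with open('example_out.csv', 'a') as f:
            chunk.to_csv(f, header=False, index=False)      # append without header
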
def load_data(self):
    """
    Loads the input dataset into the project's working file.
    """
    logger.debug('>>--- load_data --->>')
    logger.info('Loading input dataset for project: %s with task id: %s.',
                self.project['name'], self.project['task_uuid'])

    dataset = self.project['datasets'][0]
    self.left_columns.append(dataset['index_field'])

    if 'data_types' in dataset:
        left_dtypes = {}
        for col_name, col_type in dataset["data_types"].items():
            left_dtypes[col_name] = COLUMN_TYPES[col_type]
    else:
        left_dtypes = None

    try:
        usecols = dataset['columns'] or self.left_columns
    except KeyError:
        usecols = self.left_columns

    self.left_dtypes = self.right_dtypes = left_dtypes
    self.left_columns = self.right_columns = usecols

    logger.debug('Data columns: %s.', self.left_columns)
    logger.debug('Data types: %s', self.left_dtypes)

    self.right_file = self.left_file = \
        self.output_root + link_config.get('left_file', 'left_file.csv')

    super(ChunkedDedup, self).import_data(dataset['url'],
                                          usecols,
                                          self.left_file,
                                          front_cols=[self.left_index],
                                          data_types=self.left_dtypes)

    logger.debug('<<--- load_data ---<<')

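# load_data() only assumes a few keys on each dataset entry: 'url',
# 'index_field', and optionally 'columns' and 'data_types' (whose values
# must be keys of COLUMN_TYPES). A hypothetical dataset entry; the path,
# column names and type name below are illustrative, not from the project:
example_dataset = {
    'url': '/data/patients.csv',       # path to the input CSV
    'index_field': 'REC_ID',           # unique ingestion/record id column
    'columns': ['REC_ID', 'FIRST_NAME', 'LAST_NAME', 'BIRTH_DATE'],
    'data_types': {'BIRTH_DATE': 'DATE'},  # assumes 'DATE' is a COLUMN_TYPES key
}
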
def run(self): """ Runs a de-duplication project consisting of a sequence of steps. Each step is defined by a set of blocking and linking identifiers and rules. :return: A de-duplicated version of the original data file and the de-duplication summary report. """ logger.debug('>>--- run --->>') logger.info('Executing de-duplication project %s. Task id: %s.', self.project['name'], self.project['task_uuid']) ChunkedLinkBase.reset_id() self.steps = {} self.linked = pd.DataFrame() matched_file = self.temp_path + LinkFiles.MATCHED_RECORDS selected_filename = self.temp_path + LinkFiles.TEMP_DEDUP_STEP_SELECTED final_selected_file = self.temp_path + LinkFiles.TEMP_DEDUP_ALL_SELECTED dedup_results_file = self.output_root + link_config.get( 'dedup_matched_file', 'dedup_matched.csv') linked_file = self.temp_path + LinkFiles.TEMP_ENTITIES_FILE open(matched_file, 'w').close() linked_stats = {} prev_total = 0 self.total_entities = 0 first_batch = True for step in self.project['steps']: self.steps[step['seq']] = {} logger.info("De-duplication Step %s :", step['seq']) logger.info( "%s.1) Finding record pairs satisfying blocking and linking constraints...", step['seq']) pairs_count = self.pair_n_match(step=step['seq'], link_method=step['linking_method'], blocking=step['blocking_schema'], linking=step['linking_schema'], matched_file=matched_file) # This is required in case some intermediate steps have no results. # The results from previous steps will not be merged and counted. if pairs_count == 0: pairs_count = prev_total linked_stats[step['seq']] = pairs_count - prev_total prev_total = pairs_count logger.debug('Total records matched at step %s: %s', step['seq'], linked_stats[step['seq']]) # Skip the step if no records matched. if step['group'] and pairs_count > 0: step_total_entities = self.link_pairs() logger.debug('Total entities found at step %s: %s', step['seq'], step_total_entities) self.total_entities += step_total_entities self.extract_rows(data_filename=self.left_file, data_id=self.left_index, index_filename=linked_file, index_id='REC_ID', index_cols=['ENTITY_ID']) sort_csv(selected_filename, appendfile=final_selected_file, cols=['ENTITY_ID'], types={'ENTITY_ID': 'numeric'}, work_dir=self.temp_path) self.total_records_linked += pairs_count ChunkedLinkBase.append_rows(dedup_results_file, matched_file, first_batch=first_batch) first_batch = False open(matched_file, 'w').close() prev_total = 0 for step in self.project['steps']: self.steps[step['seq']]['total_records_linked'] = linked_stats.get( step['seq'], 0) if os.path.isfile(matched_file): os.remove(matched_file) if os.path.isfile(linked_file): os.remove(linked_file) if os.path.isfile(selected_filename): os.remove(selected_filename) logger.info( 'Execution of de-duplication project %s with Task id: %s completed.', self.project['name'], self.project['task_uuid']) logger.debug('<<--- run ---<<')
def load_data(self):
    """
    Loads the left and right input datasets into the project's
    working files.
    """
    logger.debug('>>--- load_data --->>')
    logger.info('Loading input datasets for project: %s with task id: %s.',
                self.project['name'], self.project['task_uuid'])

    left_data = self.project['datasets'][0]
    self.left_columns.append(left_data['index_field'])
    self.left_columns.append(left_data['entity_field'])

    if 'data_types' in left_data:
        left_dtypes = {}
        for col_name, col_type in left_data["data_types"].items():
            left_dtypes[col_name] = COLUMN_TYPES[col_type]
    else:
        left_dtypes = None

    try:
        usecols = left_data['columns'] or self.left_columns
    except KeyError:
        usecols = self.left_columns

    self.left_dtypes = left_dtypes
    self.left_columns = usecols

    logger.debug('Left data columns: %s.', self.left_columns)
    logger.debug('Left data types: %s', self.left_dtypes)

    self.left_file = self.output_root + \
        link_config.get('left_file', 'left_file.csv')

    super(ChunkedLink, self).import_data(left_data['url'],
                                         columns=usecols,
                                         dest_filename=self.left_file,
                                         front_cols=[self.left_index,
                                                     self.left_entity],
                                         data_types=self.left_dtypes)

    right_data = self.project['datasets'][1]
    self.right_columns.append(right_data['index_field'])
    self.right_columns.append(right_data['entity_field'])

    if 'data_types' in right_data:
        right_dtypes = {}
        for col_name, col_type in right_data["data_types"].items():
            right_dtypes[col_name] = COLUMN_TYPES[col_type]
    else:
        right_dtypes = None

    try:
        usecols = right_data['columns'] or self.right_columns
    except KeyError:
        usecols = self.right_columns

    self.right_dtypes = right_dtypes
    self.right_columns = usecols

    logger.debug('Right data columns: %s.', self.right_columns)
    logger.debug('Right data types: %s', self.right_dtypes)

    self.right_file = self.output_root + \
        link_config.get('right_file', 'right_file.csv')

    super(ChunkedLink, self).import_data(
        right_data['url'],
        usecols,
        self.right_file,
        front_cols=[self.right_index, self.right_entity],
        data_types=self.right_dtypes)

    logger.debug('<<--- load_data ---<<')

def run(self):
    """
    Runs all linking steps of the project and generates the linked
    output records.
    """
    logger.debug('>>--- run --->>')
    logger.info('Executing linking project %s. Task id: %s.',
                self.project['name'], self.project['task_uuid'])

    ChunkedLinkBase.reset_id()
    self.steps = {}
    linked_stats = {}
    self.total_entities = 0
    self.total_records_linked = 0

    matched_file = self.temp_path + LinkFiles.MATCHED_RECORDS
    matched_not_linked_filename = self.output_root \
        + link_config.get('matched_not_linked_filename',
                          'matched_not_linked_data.csv')
    linked_filename = self.temp_path + LinkFiles.TEMP_LINKED_RECORDS
    step_linked = self.temp_path + LinkFiles.TEMP_STEP_LINKED_FILE
    temp_sorted_file = self.temp_path + LinkFiles.TEMP_SORTED_FILE

    open(linked_filename, 'w').close()
    open(matched_not_linked_filename, 'w').close()

    first_batch = True
    for step in self.project['steps']:
        # Sort input files based on ingestion id:
        sort_csv(self.left_file,
                 appendfile=temp_sorted_file,
                 cols=[self.left_index],
                 types={self.left_index: 'numeric'},
                 work_dir=self.temp_path)
        os.remove(self.left_file)
        os.rename(temp_sorted_file, self.left_file)

        sort_csv(self.right_file,
                 appendfile=temp_sorted_file,
                 cols=[self.right_index],
                 types={self.right_index: 'numeric'},
                 work_dir=self.temp_path)
        os.remove(self.right_file)
        os.rename(temp_sorted_file, self.right_file)

        self.steps[step['seq']] = {}
        logger.info("Linking Step %s :", step['seq'])
        logger.info(
            "%s.1) Finding record pairs satisfying blocking and linking constraints...",
            step['seq'])
        open(matched_file, 'w').close()
        pairs_count = self.pair_n_match(step=step['seq'],
                                        link_method=step['linking_method'],
                                        blocking=step['blocking_schema'],
                                        linking=step['linking_schema'],
                                        matched_file=matched_file)
        linked_stats[step['seq']] = pairs_count

        if pairs_count == 0:
            logger.info('No records matched at step %s', step['seq'])
            self.steps[step['seq']]['total_records_linked'] = 0
            self.steps[step['seq']]['total_matched_not_linked'] = 0
            self.steps[step['seq']]['total_entities'] = 0
            continue

        logger.info(
            "%s.3) Identifying the linked records based on the relationship type...",
            step['seq'])
        link_stats = self.link(self.project['relationship_type'])
        self.steps[step['seq']]['total_records_linked'] = link_stats[
            'total_records_linked']
        self.total_records_linked += link_stats['total_records_linked']
        self.steps[step['seq']]['total_matched_not_linked'] = link_stats[
            'total_filtered']
        self.steps[step['seq']]['total_entities'] = link_stats['total_linked']
        self.total_entities += self.steps[step['seq']]['total_entities']

        # Sort input files based on entity id and ingestion id:
        sort_csv(self.left_file,
                 appendfile=temp_sorted_file,
                 cols=[self.left_entity, self.left_index],
                 types={self.left_entity: 'numeric',
                        self.left_index: 'numeric'},
                 work_dir=self.temp_path)
        os.remove(self.left_file)
        os.rename(temp_sorted_file, self.left_file)

        sort_csv(self.right_file,
                 appendfile=temp_sorted_file,
                 cols=[self.right_entity, self.right_index],
                 types={self.right_entity: 'numeric',
                        self.right_index: 'numeric'},
                 work_dir=self.temp_path)
        os.remove(self.right_file)
        os.rename(temp_sorted_file, self.right_file)

        self.extract_linked_records(linked_filename=step_linked,
                                    prefix='LEFT_')
        self.extract_linked_records(linked_filename=step_linked,
                                    prefix='RIGHT_')

        ChunkedLinkBase.append_rows(linked_filename, step_linked,
                                    first_batch=first_batch)
        first_batch = False

        if os.path.isfile(step_linked):
            os.remove(step_linked)

    if os.path.isfile(matched_file):
        os.remove(matched_file)

    logger.info(
        'Execution of linking project %s with Task id: %s is completed.',
        self.project['name'], self.project['task_uuid'])
    logger.debug('<<--- run ---<<')

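# In the loop above, self.link(...) is expected to return a dict exposing at
# least the keys read there: 'total_records_linked', 'total_filtered' and
# 'total_linked'. A hypothetical return value, with purely illustrative
# numbers:
example_link_stats = {
    'total_records_linked': 420,   # record pairs linked in this step
    'total_filtered': 35,          # matched but not linked (filtered out)
    'total_linked': 180,           # entities produced in this step
}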