def read_csv_(*args, **kwargs):
    """
    Read a CSV (comma-separated) file into an MTable.

    Parameters
    ----------
    args : positional arguments forwarded to the pandas read_csv command
    kwargs : keyword arguments forwarded to the pandas read_csv command,
        along with an optional "key" entry. "key" is removed from kwargs
        before pandas is called. If it is given, it is passed to MTable
        as the key; otherwise MTable is built without an explicit key
        (per the original documentation a new "_id" attribute is then
        added and used as key -- that happens inside MTable, confirm
        there).

    Returns
    -------
    result : MTable
    """
    key_attr = kwargs.pop('key', None)
    frame = pd.read_csv(*args, **kwargs)
    if key_attr is None:
        return MTable(frame)
    return MTable(frame, key=key_attr)
def create_mtable(table, key=None, ltable=None, rtable=None,
                  foreign_key_ltable=None, foreign_key_rtable=None):
    """
    Create an MTable from a dataframe.

    Parameters
    ----------
    table : data handed to the MTable constructor
    key : optional key, passed through to MTable
    ltable, rtable, foreign_key_ltable, foreign_key_rtable : optional
        values stored as the properties of the same names. They are set
        only when all four are given; when only some are given none of
        them is set and a warning is logged.

    Returns
    -------
    out_table : MTable
    """
    out_table = MTable(table, key=key)

    # (property name, value) pairs, in the order they are set on the table
    link_props = [
        ('ltable', ltable),
        ('rtable', rtable),
        ('foreign_key_ltable', foreign_key_ltable),
        ('foreign_key_rtable', foreign_key_rtable),
    ]
    supplied = [value is not None for _, value in link_props]

    if all(supplied):
        for prop_name, value in link_props:
            out_table.set_property(prop_name, value)
    elif any(supplied):
        logging.getLogger(__name__).warning(
            'Not all the properties for vtable are given; so not setting '
            'any of them')
    return out_table
def sample_table(table, size, replace=False):
    """
    Sample rows from an MTable.

    Parameters
    ----------
    table : MTable, input table to be sampled
    size : int, number of samples
    replace : boolean, whether sampling should be done with replacement.
        By default, it is set to False.

    Returns
    -------
    sampled_table : MTable, sampled table. It has the same key as the
        input and shares (by reference) the input's properties.

    Raises
    ------
    AttributeError
        If the table is empty, or if `size` exceeds the number of rows
        while sampling without replacement.
    """
    n_rows = len(table)
    if n_rows == 0:
        raise AttributeError('size of table is 0')
    # Asking for more rows than exist is only impossible when sampling
    # without replacement; with replacement np.random.choice handles it.
    if not replace and n_rows < size:
        raise AttributeError('sample size is larger than input table size')

    # row positions, sorted so the sample has a stable order
    s_indices = sorted(np.random.choice(n_rows, size, replace=replace))

    sampled_table = MTable(table.iloc[s_indices], key=table.get_key())
    sampled_table.properties = table.properties
    return sampled_table
def down_sample(ltable, rtable, size, y):
    """
    Down-sample a pair of tables.

    Up to floor(size / y) rows are drawn at random (without replacement)
    from one table, and the rows of the other table are chosen by probing
    an index built over it with the drawn rows.

    Parameters
    ----------
    ltable, rtable : MTable, input tables
    size : number used, together with `y`, to derive how many rows are
        drawn at random: min(floor(size / y), len(b_table))
    y : divisor for `size`; also forwarded to _probe_index
        (NOTE(review): its exact meaning is defined by _probe_index,
        which is not shown here -- confirm)

    Returns
    -------
    l_sampled, r_sampled : MTable, MTable
        Row subsets of ltable and rtable; each keeps the key of its
        source table and shares (by reference) its properties.

    Notes
    -----
    _order_tables presumably returns the smaller table first, the bigger
    table second and a flag saying whether ltable/rtable were swapped;
    verify against its definition.
    """
    s_table, b_table, is_swapped = _order_tables(ltable, rtable)
    s_inv_index = _inv_index(s_table)

    # math.floor returns a float on Python 2, but np.random.choice needs
    # an integer sample size.
    b_sample_size = min(int(math.floor(size / y)), len(b_table))
    b_tbl_indices = list(
        np.random.choice(len(b_table), b_sample_size, replace=False))

    # The drawn indices are row positions (0 .. len-1), so select with
    # iloc -- the same accessor used for the final projection below --
    # rather than label-based lookup.
    s_tbl_indices = list(
        _probe_index(b_table.iloc[b_tbl_indices], y, len(s_table),
                     s_inv_index))

    if is_swapped:
        # map the index lists back to the original left/right roles
        s_tbl_indices, b_tbl_indices = b_tbl_indices, s_tbl_indices

    l_sampled = MTable(ltable.iloc[list(s_tbl_indices)], key=ltable.get_key())
    l_sampled.properties = ltable.properties
    r_sampled = MTable(rtable.iloc[list(b_tbl_indices)], key=rtable.get_key())
    r_sampled.properties = rtable.properties
    return l_sampled, r_sampled
def read_csv(file_path, **kwargs):
    """
    Read a CSV (comma-separated) file into an MTable.

    Parameters
    ----------
    file_path : path of the CSV file; it is read once for metadata and
        then handed to the pandas read_csv command
    kwargs : arguments to the pandas read_csv command, along with
        optional "key" if it is an MTable, or "key", "ltable", "rtable",
        "foreign_key_ltable", "foreign_key_rtable" if it is a VTable

    Returns
    -------
    result : MTable

    Note
    ----
    read_csv can read in the metadata mentioned at the beginning of the
    file like this:
    #key=id

    A user can override or supply metadata as key-value args to the
    function. Any "skiprows" given by the caller is overwritten with the
    line count returned by get_properties_from_file (presumably the
    number of leading metadata lines -- confirm).
    """
    properties, num_lines = get_properties_from_file(file_path)
    properties, kwargs = update_properties(properties, **kwargs)
    check_properties(properties)

    kwargs['skiprows'] = num_lines
    frame = pd.read_csv(file_path, **kwargs)

    # "key" goes to the constructor; everything else becomes a property
    key_attr = properties.pop('key', None)
    if key_attr is None:
        mtable = MTable(frame)
    else:
        mtable = MTable(frame, key=key_attr)

    for prop_name, prop_value in properties.iteritems():
        mtable.set_property(prop_name, prop_value)
    return mtable
def label_table(tbl, col_name, replace=True):
    """
    Label training data.

    A copy of the table is opened in the editor (mg.edit) so that the
    label column can be filled in; the edited copy is returned.

    Parameters
    ----------
    tbl : MTable, Table to be labeled
    col_name : String, Name of the label column
    replace : Boolean, specifies whether the column with the given
        'col_name' must be overwritten (reset to 0), if it already
        exists. When False an existing column is kept as it is.
        [This option is currently experimental].

    Returns
    -------
    result : MTable, Table with labels. It has the key of `tbl` and
        shares (by reference) its properties.

    Raises
    ------
    AssertionError
        If, after editing, the label column contains values other than
        0 and 1.

    Notes
    -----
    The label value is expected to be only 0 or 1. Values are cast to
    int before they are checked.
    """
    # Kept from the original: `edit` itself is not used below (mg.edit
    # is), but the import may be relied on for its side effects.
    from magellan.gui.mtable_gui import edit  # noqa: F401

    table = tbl.copy()
    if col_name not in table.columns:
        table[col_name] = 0
    elif replace:
        logging.getLogger(__name__).warning(
            'Input table already contains column %s. ' % col_name)
        table[col_name] = 0

    mg.edit(table)
    table[col_name] = table[col_name].astype(int)

    # Raised explicitly (instead of using an assert statement) so that
    # the check is not stripped under `python -O`; the exception type is
    # unchanged for callers.
    if not table[col_name].isin([0, 1]).all():
        raise AssertionError(
            'The label column contains values other than 0 and 1')

    table = MTable(table, key=tbl.get_key())
    table.properties = tbl.properties
    return table
def _combine_block_outputs_via_union(blocker_output_list):
    """
    Combine blocker outputs by unioning ltable, rtable ids in candidate set

    Parameters
    ----------
    blocker_output_list : list
        List of blocker outputs

    Returns
    -------
    combined_blocker_output : MTable
        With combined blocker outputs

    Notes
    -----
    Combined_blocker_output contains the following attributes
    * _id
    * combined id pairs (ltable.id, rtable.id) from list of blocker outputs
    * union of non-id attributes from each of blocker output

    Rows are built one id pair at a time via get_dict and then sorted on
    the two id columns.
    """
    ltable, rtable = lr_tables(blocker_output_list)

    # names of the columns in the blocker outputs that hold the ids
    l_key = 'ltable.' + ltable.get_key()
    r_key = 'rtable.' + rtable.get_key()

    # distinct (ltable id, rtable id) pairs over all blocker outputs
    id_set = set()
    for blocker_output in blocker_output_list:
        for _, row in blocker_output.iterrows():
            id_set.add((row[l_key], row[r_key]))

    # union of the column names over all blocker outputs
    col_set = set()
    for blocker_output in blocker_output_list:
        col_set.update(blocker_output.columns)
    l_col, r_col = lr_cols(col_set)

    # index the base tables by key (key column retained) for row lookup
    l_df = ltable.to_dataframe()
    l_df.set_index(ltable.get_key(), inplace=True, drop=False)
    r_df = rtable.to_dataframe()
    r_df.set_index(rtable.get_key(), inplace=True, drop=False)

    # one dict per id pair, holding the requested ltable/rtable columns
    dict_list = []
    for l_id, r_id in id_set:
        dict_list.append(get_dict(l_df.ix[l_id], r_df.ix[r_id], l_col, r_col))
    table = pd.DataFrame(dict_list)

    # final column names/order of the output table
    f_cols = fin_cols(l_col, r_col, ltable.get_key(), rtable.get_key())
    if len(table) > 0:
        table.sort([l_key, r_key], inplace=True)
        table.reset_index(inplace=True, drop=True)
        table = MTable(table[f_cols])
    else:
        table = MTable(table, columns=f_cols)

    # metadata
    metadata = (
        ('ltable', ltable),
        ('rtable', rtable),
        ('foreign_key_ltable', 'ltable.' + ltable.get_key()),
        ('foreign_key_rtable', 'rtable.' + rtable.get_key()),
    )
    for prop_name, prop_value in metadata:
        table.set_property(prop_name, prop_value)
    return table
def combine_block_outputs_via_union(blocker_output_list):
    """
    Combine blocker outputs by unioning ltable, rtable ids in candidate set

    Parameters
    ----------
    blocker_output_list : list
        List of blocker outputs

    Returns
    -------
    combined_blocker_output : MTable
        With combined blocker outputs

    Notes
    -----
    Combined_blocker_output contains the following attributes
    * _id
    * combined id pairs (ltable.id, rtable.id) from list of blocker outputs
    * union of non-id attributes from each of blocker output

    The base tables are projected, prefixed with 'ltable.' / 'rtable.',
    looked up in bulk by id and concatenated side by side.
    """
    ltable, rtable = lr_tables(blocker_output_list)

    # names of the id columns in the combined output
    l_key = 'ltable.' + ltable.get_key()
    r_key = 'rtable.' + rtable.get_key()

    # union of the column names over all blocker outputs
    col_set = set()
    for blocker_output in blocker_output_list:
        col_set.update(blocker_output.columns)
    l_col, r_col = lr_cols(col_set)
    l_col = list(l_col)
    r_col = list(r_col)

    # project the base tables (minimally the projection must contain the
    # id column), prefix the column names and index by the prefixed id
    l_df = ltable.to_dataframe()[l_col]
    r_df = rtable.to_dataframe()[r_col]
    l_df.columns = ['ltable.' + name for name in l_df.columns]
    r_df.columns = ['rtable.' + name for name in r_df.columns]
    l_df.set_index(l_key, inplace=True, drop=False)
    r_df.set_index(r_key, inplace=True, drop=False)

    # distinct id pairs; the foreign-key columns are located by position
    # so that itertuples rows can be read by index
    id_pairs = set()
    for blocker_output in blocker_output_list:
        attr_names = blocker_output.get_attr_names()
        lfid_idx = attr_names.index(
            blocker_output.get_property('foreign_key_ltable'))
        rfid_idx = attr_names.index(
            blocker_output.get_property('foreign_key_rtable'))
        for row in blocker_output.itertuples(index=False):
            id_pairs.add((row[lfid_idx], row[rfid_idx]))
    id_set = list(id_pairs)

    # final column names/order of the output table
    f_cols = fin_cols(l_col, r_col, ltable.get_key(), rtable.get_key())
    if len(id_set) > 0:
        id_df = pd.DataFrame(id_set)
        l_consol_table = l_df.ix[id_df[0]]
        r_consol_table = r_df.ix[id_df[1]]
        # drop the id index so both sides line up row by row
        l_consol_table.reset_index(inplace=True, drop=True)
        r_consol_table.reset_index(inplace=True, drop=True)
        table = pd.concat([l_consol_table, r_consol_table], axis=1)
        table.sort([l_key, r_key], inplace=True)
        table.reset_index(inplace=True, drop=True)
        table = MTable(table[f_cols])
    else:
        table = MTable([], columns=f_cols)

    # metadata
    metadata = (
        ('ltable', ltable),
        ('rtable', rtable),
        ('foreign_key_ltable', 'ltable.' + ltable.get_key()),
        ('foreign_key_rtable', 'rtable.' + rtable.get_key()),
    )
    for prop_name, prop_value in metadata:
        table.set_property(prop_name, prop_value)
    return table
def block_candset(self, vtable, l_block_attr, r_block_attr):
    """
    Block candidate set (virtual MTable) based on l_block_attr,
    r_block_attr equivalence (similar to equi-join)

    Parameters
    ----------
    vtable : MTable
        Input candidate set. Its 'ltable' and 'rtable' properties must
        be set.
    l_block_attr, r_block_attr : string, attribute names in ltable, rtable

    Returns
    -------
    blocked_table : MTable
        The rows of vtable whose l_block_attr and r_block_attr values
        are the same. A pair in which either value is missing
        (NaN/None) is dropped.

    Notes
    -----
    The output keeps the columns and the key of vtable.

    Also, the properties of blocked table is updated with following
    key-value pairs
    * ltable - ref to ltable
    * rtable - ref to rtable
    * foreign_key_ltable - string, 'ltable.' + ltable's key name
    * foreign_key_rtable - string, 'rtable.' + rtable's key name
    """
    # do integrity checks
    ltable = vtable.get_property('ltable')
    rtable = vtable.get_property('rtable')
    self.check_attrs(ltable, rtable, l_block_attr, r_block_attr, None, None)

    l_key = 'ltable.' + ltable.get_key()
    r_key = 'rtable.' + rtable.get_key()

    # convert to dataframes and index them by key for per-row lookup
    l_df = ltable.to_dataframe()
    r_df = rtable.to_dataframe()
    l_df.set_index(ltable.get_key(), inplace=True)
    r_df.set_index(rtable.get_key(), inplace=True)

    # progress reporting: either periodic prints or a progress bar
    if mg._verbose:
        count = 0
        per_count = math.ceil(mg._percent / 100.0 * len(vtable))
        print(per_count)
    elif mg._progbar:
        bar = pyprind.ProgBar(len(vtable))

    # one boolean per candidate row: True when the row survives blocking
    valid = []
    for _, row in vtable.iterrows():
        if mg._verbose:
            count += 1
            if count % per_count == 0:
                print(str(mg._percent * count / per_count) +
                      ' percentage done !!!')
        elif mg._progbar:
            bar.update()

        # block attribute values of the two tuples, looked up by key label
        l_val = l_df.loc[row[l_key], l_block_attr]
        r_val = r_df.loc[row[r_key], r_block_attr]

        # `x != np.NaN` is always True, so it cannot detect missing
        # values; pd.isnull covers both NaN and None.
        if pd.isnull(l_val) or pd.isnull(r_val):
            valid.append(False)
        else:
            valid.append(bool(l_val == r_val))

    # should be modified
    if len(vtable) > 0:
        out_table = MTable(vtable[valid], key=vtable.get_key())
    else:
        out_table = MTable(columns=vtable.columns, key=vtable.get_key())

    out_table.set_property('ltable', ltable)
    out_table.set_property('rtable', rtable)
    out_table.set_property('foreign_key_ltable', 'ltable.' + ltable.get_key())
    out_table.set_property('foreign_key_rtable', 'rtable.' + rtable.get_key())
    return out_table
def block_tables(self, ltable, rtable, l_block_attr, r_block_attr,
                 l_output_attrs=None, r_output_attrs=None):
    """
    Block tables based on l_block_attr, r_block_attr equivalence
    (similar to equi-join)

    Parameters
    ----------
    ltable, rtable : MTable
        Input MTables
    l_block_attr, r_block_attr : string, attribute names in ltable, rtable
    l_output_attrs, r_output_attrs : list (of strings), defaults to None
        attribute names to be included in the output table

    Returns
    -------
    blocked_table : MTable
        Containing tuple pairs whose l_block_attr and r_block_attr values
        are same

    Notes
    -----
    The output columns are the ones chosen by self.output_columns from
    the keys and the requested output attributes (per the original
    documentation: _id, the id column from ltable, the id column from
    rtable, plus the output attributes).

    Also, the properties of blocked table is updated with following
    key-value pairs
    * ltable - ref to ltable
    * rtable - ref to rtable
    * foreign_key_ltable - string, 'ltable.' + ltable's key name
    * foreign_key_rtable - string, 'rtable.' + rtable's key name
    """
    # integrity checks; also yields the output attribute lists to use
    checked_attrs = self.check_attrs(ltable, rtable, l_block_attr,
                                     r_block_attr, l_output_attrs,
                                     r_output_attrs)
    l_output_attrs, r_output_attrs = checked_attrs

    # rem_nan: presumably drops rows whose block attribute is missing
    l_df = self.rem_nan(ltable, l_block_attr)
    r_df = self.rem_nan(rtable, r_block_attr)

    # equi-join on the block attributes
    joined = pd.merge(l_df, r_df,
                      left_on=l_block_attr, right_on=r_block_attr,
                      suffixes=('_ltable', '_rtable'))

    # project to the output columns and rename them
    retain_cols, final_cols = self.output_columns(
        ltable.get_key(), rtable.get_key(), list(joined.columns),
        l_output_attrs, r_output_attrs)
    projected = joined[retain_cols]
    projected.columns = final_cols
    candset = MTable(projected)

    # set metadata
    metadata = (
        ('ltable', ltable),
        ('rtable', rtable),
        ('foreign_key_ltable', 'ltable.' + ltable.get_key()),
        ('foreign_key_rtable', 'rtable.' + rtable.get_key()),
    )
    for prop_name, prop_value in metadata:
        candset.set_property(prop_name, prop_value)
    return candset
def extract_feature_vecs(s, attrs_before=None, feature_table=None,
                         attrs_after=None):
    """
    Extract feature vectors

    Parameters
    ----------
    s : MTable,
        labeled virtual MTable or combined blocker output. Its 'ltable',
        'rtable', 'foreign_key_ltable' and 'foreign_key_rtable'
        properties are read.
    attrs_before : list, defaults to None
        List of attribute names from "s" to be included in output table
        before the feature vector (a single name is also accepted).
        The list is not modified.
    feature_table : pandas DataFrame, defaults to None
        List of features to be applied; must have a 'feature_name'
        column. When None, mg.get_features_for_blocking(ltable, rtable)
        is used.
    attrs_after : list, defaults to None
        List of attribute names from "s" to be included in output table
        after the feature vector (a single name is also accepted).
        The list is not modified.

    Returns
    -------
    feature_vectors : MTable,
        Containing features values (obtained by applying feature fns in
        feature_table) and attributes as mentioned in the input. The
        column order is: ltable foreign key, rtable foreign key,
        attrs_before (in the given order), the features, attrs_after
        (in the reverse of the given order).

    Raises
    ------
    AssertionError
        If the JVM is not started, or ltable/rtable is not set on "s".
    """
    logger = logging.getLogger(__name__)

    # basic checks
    assert isJVMStarted(), \
        'JVM should be started using init_jvm to compute features'
    ltable = s.get_property('ltable')
    rtable = s.get_property('rtable')
    assert ltable is not None, 'Left table is not set'
    assert rtable is not None, 'Right table is not set'

    if feature_table is None:
        feature_table = mg.get_features_for_blocking(ltable, rtable)

    l_key = s.get_property('foreign_key_ltable')
    r_key = s.get_property('foreign_key_rtable')

    start = time.time()
    id_list = [(r[l_key], r[r_key]) for i, r in s.iterrows()]
    end = time.time()
    logger.info('Iterating rows (%d) took %f secs', len(s), end - start)

    # compute feature values; base tables are indexed by key (key column
    # retained) so that each tuple can be fetched by id
    l_df = ltable.to_dataframe()
    r_df = rtable.to_dataframe()
    l_df.set_index(ltable.get_key(), inplace=True, drop=False)
    r_df.set_index(rtable.get_key(), inplace=True, drop=False)

    start = time.time()
    feat_vals = [
        apply_feat_fns(l_df.ix[x[0]], r_df.ix[x[1]], feature_table)
        for x in id_list
    ]
    end = time.time()
    logger.info('Applying feature functions took : %f secs', end - start)

    table = pd.DataFrame(feat_vals, index=s.index.values)

    # get the feature names and re-arrange columns in that order
    feat_names = list(feature_table['feature_name'])
    table = table[feat_names]

    # insert attrs_before
    if attrs_before:
        if not isinstance(attrs_before, list):
            attrs_before = [attrs_before]
        # Each name is inserted at position 0, so walking the list
        # backwards leaves the columns in the caller's order. reversed()
        # is used instead of list.reverse() so the caller's list is not
        # mutated.
        for a in reversed(attrs_before):
            table.insert(0, a, s[a])
    table.insert(0, r_key, s[r_key])
    table.insert(0, l_key, s[l_key])

    # insert attrs after
    if attrs_after:
        if not isinstance(attrs_after, list):
            attrs_after = [attrs_after]
        # Appended back-to-front: this keeps the historical output
        # column order, again without mutating the caller's list.
        for a in reversed(attrs_after):
            table.insert(len(table.columns), a, s[a])

    # reset the table index
    table.reset_index(inplace=True, drop=True)

    feature_vectors = MTable(table)
    if s.get_key() not in feature_vectors.columns:
        feature_vectors.add_key(s.get_key())

    # metadata
    feature_vectors._metadata = s._metadata
    feature_vectors.properties = s.properties
    return feature_vectors
def get_filtered_table(ltable, rtable, corres_list):
    """
    Project both tables onto the fields named in a correspondence list.

    Parameters
    ----------
    ltable, rtable : MTable
        Input MTables
    corres_list : list of pairs
        Element 0 of each pair is an ltable column name, element 1 the
        corresponding rtable column name.

    Returns
    -------
    l_mtable, r_mtable : MTable, MTable
        ltable restricted to the left-hand columns and rtable restricted
        to the right-hand columns, each keeping the key of its source
        table.
    """
    left_fields = []
    right_fields = []
    for field_pair in corres_list:
        left_fields.append(field_pair[0])
        right_fields.append(field_pair[1])

    l_mtable = MTable(ltable[left_fields], key=ltable.get_key())
    r_mtable = MTable(rtable[right_fields], key=rtable.get_key())
    return l_mtable, r_mtable
def debug_blocker(ltable, rtable, candidate_set, output_size=200,
                  attr_corres=None):
    """
    Debug the blocker. The basic idea is trying to suggest the user a list
    of record pairs out of the candidate set with high (document) jaccard
    similarity. The object of similarity measurement (document) is
    generated based on a string concatenation method with field selection.
    Given the suggestion list, the user should go through the pairs and
    determine if there are, and how many true matches in it. And based on
    this information, the user can determine if further improvement on the
    blocking step is necessary (Ex. if there are many true matches in the
    list, the user may conclude that the blocking step is flawed, and
    should revise it to produce a better candidate set).

    Parameters
    ----------
    ltable, rtable : MTable
        Input MTables
    candidate_set : MTable
        The candidate set table after performing blocking on ltable and
        rtable. Its 'foreign_key_ltable' and 'foreign_key_rtable'
        properties are read.
    output_size : int, defaults to 200
        The size of the output suggestion list
    attr_corres : list (of tuples), defaults to None
        The list of field pairs from ltable and rtable. Each pair
        indicates a field correspondence between two tables. Since ltable
        and rtable can have different schemas, it's necessary to have this
        parameter to build the field correspondence to make sure the
        string concatenation algorithm runs correctly.
        Note each pair in the list should be represented as a tuple in
        the following format:
            (some_ltable_field, some_rtable_field)

    Returns
    -------
    suggestion_table : MTable
        Contains a list of pair suggestions with high jaccard similarity.
        The output MTable contains the following fields:
        * _id
        * similarity (of the record pair)
        * ltable record key value
        * rtable record key value
        * field pairs from filtered corres_list (removing the numeric
          types)
            ltable_field_1
            rtable_field_1 (corresponding to ltable_field_1)
            ltable_field_2
            rtable_field_2 (corresponding to ltable_field_2)
            .
            .
            ltable_field_k
            rtable_field_k (corresponding to ltable_field_k)

    Raises
    ------
    StandardError
        If ltable or rtable is empty, if output_size is not positive, or
        if no field is selected for concatenation.
    """
    # Basic checks.
    if len(ltable) == 0:
        raise StandardError('Error: ltable is empty!')
    if len(rtable) == 0:
        raise StandardError('Error: rtable is empty!')
    if output_size <= 0:
        # NOTE(review): the message names the parameter 'pred_list_size';
        # in this signature it is `output_size`.
        raise StandardError(
            'The input parameter: \'pred_list_size\' is less than or equal to 0. Nothing needs to be done!'
        )

    # Check the user input field correspondence list (if it exists) and
    # get the raw version of our internal correspondence list.
    check_input_field_correspondence_list(ltable, rtable, attr_corres)
    corres_list = get_field_correspondence_list(ltable, rtable, attr_corres)

    # Build the (col_name: col_index) dict to speed up locating a field in
    # the schema.
    ltable_col_dict = build_col_name_index_dict(ltable)
    rtable_col_dict = build_col_name_index_dict(rtable)

    # Filter correspondence list to remove numeric types. We only consider
    # string types for document concatenation. The return value is not
    # used, so corres_list is presumably filtered in place -- confirm.
    filter_corres_list(ltable, rtable, ltable_col_dict, rtable_col_dict,
                       corres_list)

    # Get field filtered new table.
    ltable_filtered, rtable_filtered = get_filtered_table(
        ltable, rtable, corres_list)

    # Select features.
    # (The bare string below is a no-op expression statement that serves
    # as a TODO note.)
    """TODO: currently we don't select the key fields even if they have the largest score. # This is because the values of the key field could be simply domain-specific serial numbers, # which might be meaningless or even harmful (when two tables use different key formats). # Modify it if this ituition is not proper."""
    feature_list = select_features(ltable_filtered, rtable_filtered)
    if len(feature_list) == 0:
        raise StandardError(
            '\nError: the selected field list is empty, nothing could be done! ' +
            'Please check if all table fields are numeric types.')

    # Get each table kgram dict (the literal 3 is presumably the k-gram
    # size -- confirm against get_kgram_dict).
    ltable_kgram_dict = get_kgram_dict(ltable_filtered,
                                       ltable_filtered.get_key(),
                                       feature_list, 3)
    rtable_kgram_dict = get_kgram_dict(rtable_filtered,
                                       rtable_filtered.get_key(),
                                       feature_list, 3)

    # Build inverted index on ltable kgrams to speed up debugging.
    inverted_index = build_inverted_index(ltable_kgram_dict)

    # Index the candidate set by (rtable key, ltable key) so that the
    # ltable keys already paired with a given rtable key can be fetched.
    ltable_key = candidate_set.get_property('foreign_key_ltable')
    rtable_key = candidate_set.get_property('foreign_key_rtable')
    indexed_candidate_set = candidate_set.set_index([rtable_key, ltable_key],
                                                    drop=False)
    candidate_index_key_set = set(indexed_candidate_set[rtable_key])

    # NOTE(review): progress_dict and count are filled/incremented but
    # never read in this function (left over from removed progress
    # prints).
    rtable_len = len(rtable_filtered)
    progress_dict = {}
    for i in range(10):
        progress_dict[int(
            (i + 1) * 1.0 / 10 * rtable_len)] = (i + 1) * 1.0 / 10

    # Heap of (similarity, lkey, rkey) tuples holding the suggestions.
    pred_index_list = []
    count = 0
    # progress reporting: either periodic prints or a progress bar
    if mg._verbose:
        count_ = 0
        per_count = math.ceil(mg._percent / 100.0 * len(rtable))
    elif mg._progbar:
        bar = pyprind.ProgBar(len(rtable))

    for rkey in rtable_kgram_dict:
        count += 1
        if mg._verbose:
            count_ += 1
            if count_ % per_count == 0:
                print str(
                    mg._percent * count_ / per_count) + ' percentage done !!!'
        elif mg._progbar:
            bar.update()

        # records with no k-grams cannot be compared
        rkgram_set = rtable_kgram_dict[rkey]
        if len(rkgram_set) == 0:
            continue

        # ltable keys already paired with this rkey in the candidate set
        cand_set = {}
        if rkey in candidate_index_key_set:
            cand_set = indexed_candidate_set.ix[rkey].index.values

        ltable_index_set = get_potential_match_set(rkgram_set, inverted_index)
        for lkey in ltable_index_set:
            # only pairs missing from the candidate set are suggested
            if lkey in cand_set:
                continue
            jac_sim = jaccard_kgram_sim(ltable_kgram_dict[lkey], rkgram_set)
            # Once the heap is full, push-then-pop keeps its size fixed
            # (assuming hq is heapq, the smallest tuple is the one
            # dropped, so the highest-similarity pairs are retained).
            if len(pred_index_list) == output_size:
                hq.heappushpop(pred_index_list, (jac_sim, lkey, rkey))
            else:
                hq.heappush(pred_index_list, (jac_sim, lkey, rkey))

    ret_data_frame = generate_prediction_table(ltable_filtered,
                                               rtable_filtered,
                                               pred_index_list)

    # wrap the result and attach the candidate set's metadata
    ret_mtable = MTable(ret_data_frame)
    ret_mtable.set_property('foreign_key_ltable', ltable_key)
    ret_mtable.set_property('foreign_key_rtable', rtable_key)
    ret_mtable.set_property('ltable', ltable)
    ret_mtable.set_property('rtable', rtable)
    return ret_mtable