from dask import delayed, threaded
from dask.threaded import get

# The helpers used below (candsplit_df, lsplit_df, rsplit_df, concat_df,
# add_id, exec_dag, build_inv_index, get_lattrs_to_project,
# get_rattrs_to_project, cross_validation, process_scores, _get_xy_data,
# _extract_feature_vecs_part) and the tokenizer classes
# (WhiteSpaceTokenizer, QgramTokenizer) are assumed to be importable from
# this project's utility and tokenizer modules.


def block_candset(self, candset, ltable, rtable, fk_ltable, fk_rtable,
                  l_key, r_key, nchunks=1, scheduler=threaded.get,
                  num_workers=None, cache_size=1e9, compute=False,
                  show_progress=True):
    # Split the candidate set into chunks and validate each chunk lazily.
    candset_splitted = delayed(candsplit_df)(candset, nchunks)

    # @todo: project ltable/rtable down to the attributes needed for
    # blocking (get_lattrs_to_project / get_rattrs_to_project) to cut memory.

    results = []
    for i in xrange(nchunks):
        result = delayed(self._block_candset_part)(
            candset_splitted[i], ltable, rtable, fk_ltable, fk_rtable,
            l_key, r_key)
        results.append(result)

    valid_candset = delayed(concat_df)(results)
    if compute:
        valid_candset = exec_dag(valid_candset, num_workers, cache_size,
                                 scheduler, show_progress)
    return valid_candset
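# Because each call returns a dask Delayed when compute=False, blocking
# steps compose into one DAG that is executed once at the end. A minimal
# sketch, assuming A/B are pandas DataFrames keyed on 'id' and that
# AttrEquivBlocker / RuleBasedBlocker are hypothetical stand-ins for the
# classes owning block_tables (defined below) and block_candset:
#
#   ab = AttrEquivBlocker()
#   rb = RuleBasedBlocker()
#   candset = ab.block_tables(A, B, 'id', 'id', 'zipcode', 'zipcode',
#                             nltable_chunks=4, nrtable_chunks=4,
#                             compute=False)
#   valid = rb.block_candset(candset, A, B, 'l_id', 'r_id', 'id', 'id',
#                            nchunks=4, compute=False)
#   result = exec_dag(valid, num_workers=4, cache_size=1e9,
#                     scheduler=threaded.get, show_progress=True)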
def extract_feature_vecs(candset, ltable, rtable, key, fk_ltable, fk_rtable,
                         l_key, r_key, attrs_before=None, feature_table=None,
                         attrs_after=None, nchunks=1, scheduler=get,
                         num_workers=None, cache_size=1e9, compute=False,
                         show_progress=True):
    # Split the candidate set and extract feature vectors chunk by chunk.
    candset_splitted = delayed(candsplit_df)(candset, nchunks)

    results = []
    for i in xrange(nchunks):
        result = delayed(_extract_feature_vecs_part)(
            candset_splitted[i], ltable, rtable, key, fk_ltable, fk_rtable,
            l_key, r_key, attrs_before, feature_table, attrs_after)
        results.append(result)

    feat_vecs = delayed(concat_df)(results)
    if compute:
        feat_vecs = exec_dag(feat_vecs, num_workers, cache_size, scheduler,
                             show_progress)
    return feat_vecs
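# A usage sketch, assuming C is a candidate set with key '_id' and foreign
# keys 'l_id'/'r_id' into A and B, and feature_table is the feature
# specification consumed by _extract_feature_vecs_part (all names here are
# hypothetical):
#
#   H = extract_feature_vecs(C, A, B, '_id', 'l_id', 'r_id', 'id', 'id',
#                            feature_table=feature_table,
#                            attrs_after=['label'], nchunks=4,
#                            compute=True, num_workers=4)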
def predict(self, table, exclude_attrs=None, target_attr=None, append=False,
            inplace=True, nchunks=1, scheduler=threaded.get, num_workers=None,
            cache_size=1e9, compute=False, show_progress=True):
    # Split the feature-vector table and predict on each chunk lazily.
    table_splitted = delayed(candsplit_df)(table, nchunks)

    results = []
    for i in xrange(nchunks):
        result = delayed(self._predict_table_part)(
            table_splitted[i], exclude_attrs, target_attr, append, inplace)
        results.append(result)

    predictions = delayed(concat_df)(results)
    if compute:
        predictions = exec_dag(predictions, num_workers, cache_size,
                               scheduler, show_progress)
    return predictions
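# A usage sketch, assuming dt is a trained matcher exposing this predict()
# and H is the feature-vector table from extract_feature_vecs (names
# hypothetical):
#
#   predictions = dt.predict(H,
#                            exclude_attrs=['_id', 'l_id', 'r_id', 'label'],
#                            target_attr='predicted', append=True,
#                            nchunks=4, compute=True)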
def block_tables(self, ltable, rtable, l_key, r_key, l_block_attr,
                 r_block_attr, l_output_attrs=None, r_output_attrs=None,
                 l_output_prefix='l_', r_output_prefix='r_',
                 nltable_chunks=1, nrtable_chunks=1, scheduler=threaded.get,
                 num_workers=None, cache_size=1e9, compute=False,
                 show_progress=True):
    # @todo validate inputs
    # @todo need to handle missing values.

    # Split both input tables into chunks; each (left, right) chunk pair
    # is blocked independently.
    ltable_splitted = delayed(lsplit_df)(ltable, nltable_chunks)
    rtable_splitted = delayed(rsplit_df)(rtable, nrtable_chunks)

    # @todo: project the chunks down to the key, blocking, and output
    # attributes (get_lattrs_to_project / get_rattrs_to_project) to cut memory.

    # list to accommodate results
    results = []
    for i in xrange(nltable_chunks):
        for j in xrange(nrtable_chunks):
            res = delayed(self._block_table_part)(
                ltable_splitted[i], rtable_splitted[j], l_key, r_key,
                l_block_attr, r_block_attr, l_output_attrs, r_output_attrs,
                l_output_prefix, r_output_prefix)
            results.append(res)

    candset = delayed(concat_df)(results)
    candset = delayed(add_id)(candset)
    if compute:
        candset = exec_dag(candset, num_workers, cache_size, scheduler,
                           show_progress)
    return candset
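# A minimal end-to-end sketch of this blocker with eager execution
# (compute=True); AttrEquivBlocker and the tables are hypothetical:
#
#   import pandas as pd
#   A = pd.DataFrame({'id': [1, 2], 'name': ['joe', 'jane'],
#                     'zipcode': [53703, 53706]})
#   B = pd.DataFrame({'id': [3, 4], 'name': ['joseph', 'john'],
#                     'zipcode': [53703, 53706]})
#   ab = AttrEquivBlocker()
#   C = ab.block_tables(A, B, 'id', 'id', 'zipcode', 'zipcode',
#                       l_output_attrs=['name'], r_output_attrs=['name'],
#                       nltable_chunks=2, nrtable_chunks=2, compute=True)
#   # C pairs rows with equal zipcodes; add_id supplies the pair ids.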
def block_candset(self, candset, ltable, rtable, fk_ltable, fk_rtable,
                  l_key, r_key, l_block_attr, r_block_attr,
                  rem_stop_words=False, q_val=None, word_level=True,
                  overlap_size=1, nchunks=1, scheduler=threaded.get,
                  num_workers=None, cache_size=1e9, compute=False,
                  show_progress=True):
    # Split the candidate set into chunks and validate each chunk lazily.
    cand_splitted = delayed(candsplit_df)(candset, nchunks)

    # @todo: project ltable/rtable down to the key and blocking attributes
    # (get_lattrs_to_project / get_rattrs_to_project) to cut memory.

    # Tokenize either on whitespace (word level) or into q-grams.
    if word_level:
        tokenizer = WhiteSpaceTokenizer()
    else:
        tokenizer = QgramTokenizer(qval=q_val)

    results = []
    for i in xrange(nchunks):
        result = delayed(self._block_candset_part)(
            cand_splitted[i], ltable, rtable, fk_ltable, fk_rtable,
            l_key, r_key, l_block_attr, r_block_attr, rem_stop_words,
            tokenizer, overlap_size)
        results.append(result)

    valid_candset = delayed(concat_df)(results)
    if compute:
        valid_candset = exec_dag(valid_candset, num_workers, cache_size,
                                 scheduler, show_progress)
    return valid_candset
def block_tables(self, ltable, rtable, l_key, r_key, l_output_attrs=None,
                 r_output_attrs=None, l_output_prefix='l_',
                 r_output_prefix='r_', nltable_chunks=1, nrtable_chunks=1,
                 scheduler=threaded.get, num_workers=None, cache_size=1e9,
                 compute=False, show_progress=True):
    # Split both input tables into chunks.
    ltable_splitted = delayed(lsplit_df)(ltable, nltable_chunks)
    rtable_splitted = delayed(rsplit_df)(rtable, nrtable_chunks)

    # Currently unused by the loop below; @todo needs to be modified as
    # self.ltable_attrs can be None.
    l_proj_attrs = get_lattrs_to_project(l_key, self.ltable_attrs,
                                         l_output_attrs)
    r_proj_attrs = get_rattrs_to_project(r_key, self.rtable_attrs,
                                         r_output_attrs)

    results = []
    for i in xrange(nltable_chunks):
        for j in xrange(nrtable_chunks):
            result = delayed(self._block_tables_part)(
                ltable_splitted[i], rtable_splitted[j], l_key, r_key,
                l_output_attrs, r_output_attrs, l_output_prefix,
                r_output_prefix)
            results.append(result)

    candset = delayed(concat_df)(results)
    candset = delayed(add_id)(candset)
    if compute:
        candset = exec_dag(candset, num_workers, cache_size, scheduler,
                           show_progress)
    return candset
def select_matcher(matchers, x=None, y=None, table=None, exclude_attrs=None,
                   target_attr=None, metric='precision', k=5,
                   random_state=None, scheduler=threaded.get,
                   num_workers=None, cache_size=1e9, compute=False,
                   show_progress=True):
    x, y = _get_xy_data(x, y, table, exclude_attrs, target_attr)

    # Cross-validate each matcher lazily, then rank them by the requested
    # metric.
    scores = []
    for m in matchers:
        score = delayed(cross_validation)(m, x, y, metric, k, random_state)
        scores.append(score)

    res = delayed(process_scores)(matchers, scores, k)
    if compute:
        res = exec_dag(res, num_workers, cache_size, scheduler, show_progress)
    return res
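# A usage sketch: pick the best of several matchers by k-fold
# cross-validated precision (DTMatcher/RFMatcher are hypothetical matcher
# classes; H is a labeled feature-vector table):
#
#   res = select_matcher([DTMatcher(), RFMatcher()], table=H,
#                        exclude_attrs=['_id', 'l_id', 'r_id'],
#                        target_attr='label', metric='precision', k=5,
#                        compute=True)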
def block_tables(self, ltable, rtable, l_key, r_key, l_block_attr,
                 r_block_attr, rem_stop_words=False, q_val=None,
                 word_level=True, overlap_size=1, l_output_attrs=None,
                 r_output_attrs=None, l_output_prefix='l_',
                 r_output_prefix='r_', nltable_chunks=1, nrtable_chunks=1,
                 scheduler=threaded.get, num_workers=None, cache_size=1e9,
                 compute=False, show_progress=True):
    # @todo: validations.

    ltable_splitted = delayed(lsplit_df)(ltable, nltable_chunks)
    rtable_splitted = delayed(rsplit_df)(rtable, nrtable_chunks)

    # @todo: project the chunks down to the key, blocking, and output
    # attributes (get_lattrs_to_project / get_rattrs_to_project) to cut memory.

    # Tokenize either on whitespace (word level) or into q-grams.
    if word_level:
        tokenizer = WhiteSpaceTokenizer()
    else:
        tokenizer = QgramTokenizer(qval=q_val)

    # Tokenize the left-table chunks and build an inverted index over the
    # tokens.
    ltokens = []
    for i in xrange(nltable_chunks):
        tokens = delayed(self.process_and_tokenize_ltable)(
            ltable_splitted[i], l_key, l_block_attr, tokenizer,
            rem_stop_words)
        ltokens.append(tokens)
    inv_index = delayed(build_inv_index)(ltokens)

    # Probe the inverted index with each right-table chunk.
    results = []
    for j in xrange(nrtable_chunks):
        result = delayed(self._block_table_part)(
            inv_index, ltable, rtable_splitted[j], l_key, r_key,
            l_block_attr, r_block_attr, tokenizer, overlap_size,
            rem_stop_words, l_output_attrs, r_output_attrs,
            l_output_prefix, r_output_prefix)
        results.append(result)

    candset = delayed(concat_df)(results)
    candset = delayed(add_id)(candset)
    if compute:
        candset = exec_dag(candset, num_workers, cache_size, scheduler,
                           show_progress)
    return candset
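# A usage sketch: keep pairs whose 'name' values share at least two words;
# OverlapBlocker is a hypothetical stand-in for the class owning this
# method. Pass word_level=False and q_val=3 for character q-gram overlap.
#
#   ob = OverlapBlocker()
#   C = ob.block_tables(A, B, 'id', 'id', 'name', 'name',
#                       rem_stop_words=True, word_level=True,
#                       overlap_size=2, l_output_attrs=['name'],
#                       r_output_attrs=['name'], nltable_chunks=2,
#                       nrtable_chunks=2, compute=True)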