Example #1
    def block_candset(self,
                      candset,
                      ltable,
                      rtable,
                      fk_ltable,
                      fk_rtable,
                      l_key,
                      r_key,
                      nchunks=1,
                      scheduler=threaded.get,
                      num_workers=None,
                      cache_size=1e9,
                      compute=False,
                      show_progress=True):
        candset_splitted = delayed(candsplit_df)(candset, nchunks)
        # l_proj_attrs = (get_lattrs_to_project)(l_key, self.ltable_attrs)
        # r_proj_attrs = (get_rattrs_to_project)(r_key, self.rtable_attrs)
        #
        # ltbl = (lproj_df)(ltable, l_proj_attrs)
        # rtbl = (rproj_df)(rtable, r_proj_attrs)

        results = []
        for i in xrange(nchunks):
            result = delayed(self._block_candset_part)(candset_splitted[i],
                                                       ltable, rtable,
                                                       fk_ltable, fk_rtable,
                                                       l_key, r_key)
            results.append(result)
        valid_candset = delayed(concat_df)(results)

        if compute:
            valid_candset = exec_dag(valid_candset, num_workers, cache_size,
                                     scheduler, show_progress)
        return valid_candset
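Note that this only assembles a dask task graph: `candsplit_df`, `_block_candset_part`, and `concat_df` are all wrapped in `delayed`, so nothing runs until the graph is executed, either by passing `compute=True` (which routes through `exec_dag`) or by computing the returned object yourself. A hedged usage sketch follows; `blocker`, the input tables, and the key/foreign-key column names are assumptions made only for illustration.

# Hypothetical usage; `blocker`, `candset`, `A`, `B` and the column names
# are assumptions, not objects defined in the snippet above.
graph = blocker.block_candset(candset, A, B,
                              fk_ltable='l_id', fk_rtable='r_id',
                              l_key='id', r_key='id',
                              nchunks=4, compute=False)
valid_candset = graph.compute()   # the default threaded scheduler runs the DAG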
Example #2
def extract_feature_vecs(candset,
                         ltable,
                         rtable,
                         key,
                         fk_ltable,
                         fk_rtable,
                         l_key,
                         r_key,
                         attrs_before=None,
                         feature_table=None,
                         attrs_after=None,
                         nchunks=1,
                         scheduler=get,
                         num_workers=None,
                         cache_size=1e9,
                         compute=False,
                         show_progress=True):
    candset_splitted = delayed(candsplit_df)(candset, nchunks)
    results = []
    for i in xrange(nchunks):
        result = delayed(_extract_feature_vecs_part)(
            candset_splitted[i], ltable, rtable, key, fk_ltable, fk_rtable,
            l_key, r_key, attrs_before, feature_table, attrs_after)
        results.append(result)
    feat_vecs = delayed(concat_df)(results)

    if compute:
        feat_vecs = exec_dag(feat_vecs, num_workers, cache_size, scheduler,
                             show_progress)
    return feat_vecs
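Underneath, this is the same chunk-and-concatenate pattern as the other examples: split the candidate set, extract features per chunk lazily, then concatenate. A self-contained toy version of that pattern is sketched below; the feature function is a stand-in for `_extract_feature_vecs_part`, which is not shown here.

# Toy, self-contained sketch of the split -> per-chunk featurize -> concat pattern.
# featurize_part is a stand-in for _extract_feature_vecs_part.
import numpy as np
import pandas as pd
from dask import delayed

candset = pd.DataFrame({'_id': range(4),
                        'l_name': ['anna', 'bob', 'carol', 'dave'],
                        'r_name': ['ann', 'rob', 'carole', 'david']})

def featurize_part(chunk):
    out = chunk.copy()
    # Example feature: absolute difference in name lengths.
    out['len_diff'] = (chunk['l_name'].str.len() - chunk['r_name'].str.len()).abs()
    return out

parts = [delayed(featurize_part)(c) for c in np.array_split(candset, 2)]
feat_vecs = delayed(pd.concat)(parts)    # still a lazy task graph
print(feat_vecs.compute())               # triggers execution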
Example #3
    def predict(self,
                table,
                exclude_attrs=None,
                target_attr=None,
                append=False,
                inplace=True,
                nchunks=1,
                scheduler=threaded.get,
                num_workers=None,
                cache_size=1e9,
                compute=False,
                show_progress=True):
        candset_splitted = delayed(candsplit_df)(table, nchunks)
        results = []
        for i in xrange(nchunks):
            result = delayed(self._predict_table_part)(candset_splitted[i],
                                                       exclude_attrs,
                                                       target_attr, append,
                                                       inplace)
            results.append(result)
        feat_vecs = delayed(concat_df)(results)

        if compute:
            feat_vecs = exec_dag(feat_vecs, num_workers, cache_size, scheduler,
                                 show_progress)
        return feat_vecs

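`exec_dag` itself is not shown in any of these snippets; judging from its parameters (num_workers, cache_size, scheduler, show_progress) it presumably runs the delayed graph on the chosen scheduler with a progress bar and an opportunistic cache. The sketch below is an assumption built on dask's own diagnostics, not the library's actual implementation.

# Assumed sketch of an exec_dag-style runner; not the library's real code.
from dask import threaded
from dask.diagnostics import ProgressBar
from dask.cache import Cache   # opportunistic caching; needs the optional 'cachey' package

def exec_dag_sketch(dag, num_workers=None, cache_size=1e9,
                    scheduler=threaded.get, show_progress=True):
    cache = Cache(cache_size)
    cache.register()
    try:
        if show_progress:
            with ProgressBar():
                # get=/num_workers matches the older dask API used in these snippets
                return dag.compute(get=scheduler, num_workers=num_workers)
        return dag.compute(get=scheduler, num_workers=num_workers)
    finally:
        cache.unregister()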
Example #4
    def block_tables(self,
                     ltable,
                     rtable,
                     l_key,
                     r_key,
                     l_block_attr,
                     r_block_attr,
                     l_output_attrs=None,
                     r_output_attrs=None,
                     l_output_prefix='l_',
                     r_output_prefix='r_',
                     nltable_chunks=1,
                     nrtable_chunks=1,
                     scheduler=threaded.get,
                     num_workers=None,
                     cache_size=1e9,
                     compute=False,
                     show_progress=True):
        # @todo validate inputs
        # @todo need to handle missing values.

        ltable_splitted = (lsplit_df)(ltable, nltable_chunks)
        rtable_splitted = (rsplit_df)(rtable, nrtable_chunks)

        # l_proj_attrs = (get_lattrs_to_project)(l_key, l_block_attr, l_output_attrs)
        # r_proj_attrs = (get_rattrs_to_project)(r_key, r_block_attr, r_output_attrs)

        # list to accommodate results
        results = []
        for i in xrange(nltable_chunks):
            # ltbl = (lproj_df)(ltable_splitted[i], l_proj_attrs)
            for j in xrange(nrtable_chunks):
                # rtbl = (rproj_df)(rtable_splitted[j], r_proj_attrs)
                res = delayed(self._block_table_part)(
                    ltable_splitted[i], rtable_splitted[j], l_key, r_key,
                    l_block_attr, r_block_attr, l_output_attrs, r_output_attrs,
                    l_output_prefix, r_output_prefix)
                results.append(res)
        candset = delayed(concat_df)(results)
        candset = delayed(add_id)(candset)

        if compute:
            candset = exec_dag(candset, num_workers, cache_size, scheduler,
                               show_progress)

        return candset
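Each task created by the nested loop blocks one left chunk against one right chunk, so the graph holds nltable_chunks * nrtable_chunks blocking tasks plus a final concatenation and id assignment. Below is a self-contained sketch of that per-pair step, with a pandas equi-join standing in for `_block_table_part` (an assumption) and a simple counter standing in for `add_id`.

# Self-contained sketch; the equi-join is an assumed stand-in for _block_table_part.
import numpy as np
import pandas as pd
from dask import delayed

A = pd.DataFrame({'id': [1, 2, 3], 'name': ['Anna', 'Bob', 'Carl'],
                  'zipcode': ['53703', '53706', '53703']})
B = pd.DataFrame({'id': [7, 8, 9], 'name': ['Ann', 'Rob', 'Carla'],
                  'zipcode': ['53703', '53711', '53703']})

def block_part(l_chunk, r_chunk, l_block_attr, r_block_attr):
    # Keep only the pairs that agree on the blocking attribute.
    return l_chunk.merge(r_chunk, left_on=l_block_attr, right_on=r_block_attr,
                         suffixes=('_l', '_r'))

results = [delayed(block_part)(lc, rc, 'zipcode', 'zipcode')
           for lc in np.array_split(A, 2)
           for rc in np.array_split(B, 2)]
candset = delayed(pd.concat)(results).compute().reset_index(drop=True)
candset.insert(0, '_id', range(len(candset)))   # rough add_id equivalent
print(candset)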
Example #5
    def block_candset(self,
                      candset,
                      ltable,
                      rtable,
                      fk_ltable,
                      fk_rtable,
                      l_key,
                      r_key,
                      l_block_attr,
                      r_block_attr,
                      rem_stop_words=False,
                      q_val=None,
                      word_level=True,
                      overlap_size=1,
                      nchunks=1,
                      scheduler=threaded.get,
                      num_workers=None,
                      cache_size=1e9,
                      compute=False,
                      show_progress=True):
        cand_splitted = delayed(candsplit_df)(candset, nchunks)

        # l_proj_attrs = (get_lattrs_to_project)(l_key, l_block_attr)
        # r_proj_attrs = (get_rattrs_to_project)(r_key, r_block_attr)
        #
        # ltbl = (lproj_df)(ltable, l_proj_attrs)
        # rtbl = (rproj_df)(rtable, r_proj_attrs)

        if word_level:
            tokenizer = WhiteSpaceTokenizer()
        else:
            tokenizer = QgramTokenizer(qval=q_val)

        results = []
        for i in xrange(nchunks):
            result = delayed(self._block_candset_part)(
                cand_splitted[i], ltable, rtable, fk_ltable, fk_rtable, l_key,
                r_key, l_block_attr, r_block_attr, rem_stop_words, tokenizer,
                overlap_size)
            results.append(result)

        valid_candset = delayed(concat_df)(results)
        if compute:
            valid_candset = exec_dag(valid_candset, num_workers, cache_size,
                                     scheduler, show_progress)
        return valid_candset
Example #6
    def block_tables(self,
                     ltable,
                     rtable,
                     l_key,
                     r_key,
                     l_output_attrs=None,
                     r_output_attrs=None,
                     l_output_prefix='l_',
                     r_output_prefix='r_',
                     nltable_chunks=1,
                     nrtable_chunks=1,
                     scheduler=threaded.get,
                     num_workers=None,
                     cache_size=1e9,
                     compute=False,
                     show_progress=True):

        ltable_splitted = (lsplit_df)(ltable, nltable_chunks)
        rtable_splitted = (rsplit_df)(rtable, nrtable_chunks)

        l_proj_attrs = (get_lattrs_to_project)(l_key, self.ltable_attrs,
                                               l_output_attrs)
        # needs to be modified as self.ltable_attrs can be None.
        r_proj_attrs = (get_rattrs_to_project)(r_key, self.rtable_attrs,
                                               r_output_attrs)
        results = []
        for i in xrange(nltable_chunks):
            # ltbl = (lproj_df)(ltable_splitted[i], l_proj_attrs)
            for j in xrange(nrtable_chunks):
                # rtbl = (rproj_df)(rtable_splitted[j], r_proj_attrs)
                result = delayed(self._block_tables_part)(
                    ltable_splitted[i], rtable_splitted[j], l_key, r_key,
                    l_output_attrs, r_output_attrs, l_output_prefix,
                    r_output_prefix)
                results.append(result)
        candset = delayed(concat_df)(results)
        candset = delayed(add_id)(candset)

        if compute:
            candset = exec_dag(candset, num_workers, cache_size, scheduler,
                               show_progress)

        return candset
Example #7
def select_matcher(matchers,
                   x=None,
                   y=None,
                   table=None,
                   exclude_attrs=None,
                   target_attr=None,
                   metric='precision',
                   k=5,
                   random_state=None,
                   scheduler=threaded.get,
                   num_workers=None,
                   cache_size=1e9,
                   compute=False,
                   show_progress=True):
    x, y = _get_xy_data(x, y, table, exclude_attrs, target_attr)
    scores = []
    for m in matchers:
        score = (cross_validation)(m, x, y, metric, k, random_state)
        scores.append(score)
    res = delayed(process_scores)(matchers, scores, k)
    if compute:
        res = exec_dag(res, num_workers, cache_size, scheduler, show_progress)
    return res
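`cross_validation` and `process_scores` are not shown in these snippets; per matcher, the scoring step amounts to k-fold cross-validation over the feature vectors under the chosen metric. As an analogy only (not the library's own code), the same measurement with scikit-learn looks like this:

# Analogy: scoring candidate matchers with k-fold cross-validation.
# Toy data; the real snippet delegates this to cross_validation/process_scores.
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

x = np.random.rand(100, 4)             # toy feature vectors
y = np.random.randint(0, 2, 100)       # toy match / non-match labels

matchers = [DecisionTreeClassifier(), LogisticRegression()]
for m in matchers:
    scores = cross_val_score(m, x, y, scoring='precision', cv=5)
    print(type(m).__name__, scores.mean())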
Example #8
    def block_tables(self,
                     ltable,
                     rtable,
                     l_key,
                     r_key,
                     l_block_attr,
                     r_block_attr,
                     rem_stop_words=False,
                     q_val=None,
                     word_level=True,
                     overlap_size=1,
                     l_output_attrs=None,
                     r_output_attrs=None,
                     l_output_prefix='l_',
                     r_output_prefix='r_',
                     nltable_chunks=1,
                     nrtable_chunks=1,
                     scheduler=threaded.get,
                     num_workers=None,
                     cache_size=1e9,
                     compute=False,
                     show_progress=True):
        # @todo: validations.

        ltable_splitted = (lsplit_df)(ltable, nltable_chunks)
        rtable_splitted = (rsplit_df)(rtable, nrtable_chunks)

        # l_proj_attrs = (get_lattrs_to_project)(l_key, l_block_attr, l_output_attrs)
        # r_proj_attrs = (get_rattrs_to_project)(r_key, r_block_attr, r_output_attrs)

        if word_level:
            tokenizer = WhiteSpaceTokenizer()
        else:
            tokenizer = QgramTokenizer(qval=q_val)

        # ltokens = []
        # lsplitted = delayed(split_df)(ltable, nlchunks)
        # for i in range(nlchunks):
        #     lcat_strings = (delayed)(preprocess_table)(lsplitted[i], lid)
        #     tokens = (delayed)(tokenize_strings_wsp)(lcat_strings, lstopwords)
        #     ltokens.append(tokens)
        ltokens = []
        for i in xrange(nltable_chunks):
            # def process_and_tokenize_ltable(self, ltable, l_key, l_block_attr, tokenizer,
            #                                 rem_stop_words):

            tokens = delayed(self.process_and_tokenize_ltable)(
                ltable_splitted[i], l_key, l_block_attr, tokenizer,
                rem_stop_words)
            ltokens.append(tokens)
        inv_index = delayed(build_inv_index)(ltokens)
        results = []
        # for i in xrange(nltable_chunks):
        # ltbl = (lproj_df)(ltable_splitted[i], l_proj_attrs)
        for j in xrange(nrtable_chunks):
            # rtbl = (rproj_df)(rtable_splitted[j], r_proj_attrs)
            result = delayed(self._block_table_part)(
                inv_index, ltable, rtable_splitted[j], l_key, r_key,
                l_block_attr, r_block_attr, tokenizer, overlap_size,
                rem_stop_words, l_output_attrs, r_output_attrs,
                l_output_prefix, r_output_prefix)
            results.append(result)
        candset = delayed(concat_df)(results)
        candset = delayed(add_id)(candset)

        if compute:
            candset = exec_dag(candset, num_workers, cache_size, scheduler,
                               show_progress)

        return candset
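The key structure in this blocker is the inverted index built from the tokenized left tuples (`build_inv_index`): each right tuple is then compared only against left tuples that share at least `overlap_size` tokens. A minimal, self-contained sketch of that idea is given below, with plain whitespace splitting standing in for the snippet's tokenizers; the dictionary layout is an assumption and the library's `build_inv_index` may differ.

# Minimal sketch of inverted-index-based overlap blocking.
# str.split() stands in for the tokenizer; the index layout is an assumption.
from collections import defaultdict

left_names = {1: 'apple iphone 6', 2: 'samsung galaxy s7'}

# token -> set of left ids containing that token
inv_index = defaultdict(set)
for lid, name in left_names.items():
    for tok in set(name.lower().split()):
        inv_index[tok].add(lid)

# Probe with one right tuple: count shared tokens per left id and keep
# the candidates whose overlap reaches overlap_size.
overlap_size = 2
right_name = 'apple iphone 6s plus'
counts = defaultdict(int)
for tok in set(right_name.lower().split()):
    for lid in inv_index.get(tok, ()):
        counts[lid] += 1
candidates = [lid for lid, c in counts.items() if c >= overlap_size]
print(candidates)   # -> [1]  ('apple' and 'iphone' overlap)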