Beispiel #1
0
def downsample_dbg(ltable,
                   rtable,
                   size,
                   y,
                   stopwords=None,
                   nchunks=1,
                   scheduler=threaded.get,
                   compute=True):
    """Eager (debug) chunked down-sampling of (ltable, rtable).

    Builds an inverted index over the tokenized ltable chunks, samples
    ``size`` tuples from rtable, and probes each sampled chunk against the
    index; ``postprocess`` assembles the resulting sampled table pair.

    Args:
        ltable, rtable: input tables (pandas DataFrames, presumably).
        size: number of rtable tuples to sample.
        y: passed through to ``probe`` — presumably the number of ltable
           matches retained per rtable tuple; confirm against ``probe``.
        stopwords: optional list of stop words for tokenization.
        nchunks: number of chunks to split each table into.
        scheduler, compute: accepted for signature parity with the dask
            version; unused in this debug implementation.

    Returns:
        Whatever ``postprocess`` returns for the probe results.
    """
    # Mutable-default fix: a shared [] default would leak state across calls.
    if stopwords is None:
        stopwords = []

    # BUG FIX: the original referenced undefined ``nlchunks``/``nrchunks``
    # (NameError) and tokenized the *entire* ltable once per iteration.
    # Both chunk counts now come from ``nchunks`` and ltable is split the
    # same way rtable is below.
    lsplitted = split_df(ltable, nchunks)
    ltokens = []
    for i in range(nchunks):
        lcat_strings = preprocess_table(lsplitted[i])
        tokens = tokenize_strings_wsp(lcat_strings, stopwords)
        ltokens.append(tokens)

    invindex = build_inv_index(ltokens)

    # Sample rtable, split the sample, and probe each chunk against the index.
    rsample = sample(rtable, size)
    rsplitted = split_df(rsample, nchunks)
    probe_rslts = []
    for i in range(nchunks):
        rcat_strings = preprocess_table(rsplitted[i])
        rtokens = tokenize_strings_wsp(rcat_strings, stopwords)
        probe_rslt = probe(rtokens, invindex, y)
        probe_rslts.append(probe_rslt)

    sampled_tbls = postprocess(probe_rslts, ltable, rsample)

    return sampled_tbls
Beispiel #2
0
    def _block_table_part(self, ltable, rtable, l_key, r_key, l_block_attr,
                          r_block_attr, tokenizer, threshold, rem_stop_words,
                          l_output_attrs, r_output_attrs, l_output_prefix,
                          r_output_prefix):
        """Block one (ltable, rtable) partition and return the candidate set.

        Projects both tables to the needed columns, drops rows with a
        missing blocking attribute, builds an inverted index over the
        tokenized left side, probes the tokenized right side against it,
        and materializes the surviving pairs as a DataFrame with output
        attributes attached.
        """
        # Project both tables down to just the columns needed downstream.
        ltable = lproj_df(ltable,
                          get_lattrs_to_project(l_key, l_block_attr,
                                                l_output_attrs))
        rtable = rproj_df(rtable,
                          get_rattrs_to_project(r_key, r_block_attr,
                                                r_output_attrs))

        # Drop rows whose blocking attribute is missing.
        l_valid = ltable[ltable[l_block_attr].notnull()]
        r_valid = rtable[rtable[r_block_attr].notnull()]

        # Tokenize the left side and build an inverted index over it.
        l_strings = self._preprocess_table(l_valid, l_key, l_block_attr,
                                           rem_stop_words)
        inv_index = build_inv_index([tokenize_strings(l_strings, tokenizer)])

        # Tokenize the right side and probe it against the index.
        r_strings = self._preprocess_table(r_valid, r_key, r_block_attr,
                                           rem_stop_words)
        probe_rslt = self._probe(tokenize_strings(r_strings, tokenizer),
                                 inv_index, threshold)

        fk_ltable = l_output_prefix + l_key
        fk_rtable = r_output_prefix + r_key
        candset = pd.DataFrame(probe_rslt.get_pairids(),
                               columns=[fk_ltable, fk_rtable])
        candset = add_attributes(candset, l_valid, r_valid, fk_ltable,
                                 fk_rtable, l_key, r_key, l_output_attrs,
                                 r_output_attrs, l_output_prefix,
                                 r_output_prefix)
        # Debug aid: add_attributes is expected to hand back a DataFrame.
        if not isinstance(candset, pd.DataFrame):
            print('Returning {0}'.format(candset))

        return candset
Beispiel #3
0
def block_table_chunks(ldf, rdf, l_key, r_key, l_attr, r_attr, tokenizer,
                       threshold, stopwords, l_out, r_out, l_prefix, r_prefix):
    """Block one (ldf, rdf) chunk pair and return candidate pairs.

    Builds an inverted index over the tokenized left chunk, probes the
    tokenized right chunk against it, and returns the surviving pairs as a
    DataFrame (with output attributes attached when any pairs survived).
    """
    # Keep only rows whose blocking attribute is present.
    ldf = ldf[ldf[l_attr].notnull()]
    rdf = rdf[rdf[r_attr].notnull()]

    # Inverted index over the tokenized left chunk.
    invindex = build_inv_index(
        [tokenize_strings(preprocess_table(ldf, l_attr, l_key, stopwords),
                          tokenizer)])

    # Probe the tokenized right chunk against the index.
    res = probe(
        tokenize_strings(preprocess_table(rdf, r_attr, r_key, stopwords),
                         tokenizer),
        invindex, threshold)

    lcol = l_prefix + l_key
    rcol = r_prefix + r_key
    res = pd.DataFrame(res.get_pairids(), columns=[lcol, rcol])
    # Attach output attributes only when at least one pair survived.
    if len(res):
        res = add_attributes(res, ldf, rdf, lcol, rcol, l_key, r_key, l_out,
                             r_out, l_prefix, r_prefix)
    return res
Beispiel #4
0
def downsample_sm(ltable,
                  rtable,
                  lid,
                  rid,
                  size,
                  y,
                  lstopwords=None,
                  rstopwords=None):
    """Single-machine down-sampling of (ltable, rtable).

    Builds an inverted index over the tokenized ltable, samples ``size``
    tuples from rtable, probes the sampled tokens against the index, and
    post-processes the probe result into the sampled table pair.

    Args:
        ltable, rtable: input tables.
        lid, rid: key attributes of ltable and rtable, respectively.
        size: number of rtable tuples to sample.
        y: passed through to ``probe`` — presumably the number of ltable
           matches retained per sampled rtable tuple; confirm against
           ``probe``.
        lstopwords, rstopwords: optional stop-word lists for the left and
            right tokenization passes.

    Returns:
        Whatever ``postprocess`` returns for the probe result.
    """
    # BUG FIX: mutable list defaults are shared across calls; use None
    # sentinels instead (backward compatible — behavior for callers that
    # pass their own lists is unchanged).
    if lstopwords is None:
        lstopwords = []
    if rstopwords is None:
        rstopwords = []

    # Tokenize ltable and build an inverted index over its tokens.
    lcat_strings = preprocess_table(ltable, lid)
    ltokens = tokenize_strings_wsp(lcat_strings, lstopwords)
    invindex = build_inv_index([ltokens])

    # Sample `size` rows from rtable and tokenize them.
    rsample = sample(rtable, size)
    rcat_strings = preprocess_table(rsample, rid)
    rtokens = tokenize_strings_wsp(rcat_strings, rstopwords)

    probe_rslt = probe(rtokens, invindex, y)

    sampled_tbls = postprocess([probe_rslt], ltable, rsample)

    return sampled_tbls
Beispiel #5
0
def grid_search_overlap(input_tables,
                        params_command,
                        params_grid,
                        nbins=10,
                        do_cartesian=False,
                        repeat=1):
    """Two-stage grid search over ``OverlapBlocker.block_tables`` parameters.

    Samples both input tables, then tunes the parameters in
    ``params_grid`` in two stages: first with the first grid key pinned to
    1, then with the second grid key pinned to the best value found.

    Args:
        input_tables: dict with 'ltable' and 'rtable' DataFrames.
        params_command: fixed arguments for the blocker command.
        params_grid: dict mapping parameter name -> list of candidate
            values; insertion order determines which key is tuned first.
        nbins: number of bins used when sampling the tables.
        do_cartesian: whether the grid is expanded as a cartesian product.
        repeat: number of repetitions per configuration in the search.

    Returns:
        (best_config, result) from the second-stage grid search.

    Raises:
        ValueError: if ``check_param_grid`` rejects the grid.
    """
    ob = OverlapBlocker()
    command = ob.block_tables
    args = process_args(command, input_tables, params_command, params_grid)
    if not check_param_grid(params_grid, do_cartesian):
        raise ValueError('Check the parameter grid')

    sample_size = 0.1
    ltable, rtable = input_tables['ltable'], input_tables['rtable']
    # NOTE(review): this mutates the caller's tables in place by adding an
    # 'id' column — confirm that is intended.
    ltable['id'] = list(range(len(ltable)))
    rtable['id'] = list(range(len(rtable)))

    def _make_tokenizer():
        # Word-level blocking uses whitespace tokens, otherwise qgrams.
        # ``== True`` kept deliberately: preserves the original's exact
        # comparison semantics for non-bool values of 'word_level'.
        if args['word_level'] == True:
            return WhiteSpaceTokenizer()
        return QgramTokenizer(qval=args['q_val'])

    # Sample the left table and build an inverted index over its tokens.
    lid = args['l_key']
    l_block_attr = args['l_block_attr']
    s_ltable = sample_ltable(ltable, lid, l_block_attr, nbins, sample_size)
    ob = OverlapBlocker()
    p = ob.process_and_tokenize_ltable(ltable, lid, l_block_attr,
                                       _make_tokenizer(), [])
    inv_index = build_inv_index([p])

    # Sample the right table, guided by the inverted index.
    rid = args['r_key']
    r_block_attr = args['r_block_attr']
    s_rtable = sample_rtable(rtable, rid, r_block_attr, _make_tokenizer(),
                             nbins, sample_size, inv_index)
    args['ltable'] = s_ltable
    args['rtable'] = s_rtable

    # Staged tuning, stage 1: pin the first grid key to 1 and search over
    # the remaining parameters.
    # BUG FIX: dict.keys() views are not subscriptable in Python 3; the
    # original ``keys[0]``/``keys[1]`` raised TypeError. Materialize once.
    keys = list(params_grid)
    copy_params_grid = deepcopy(params_grid)
    copy_params_grid[keys[0]] = [1]

    config_setting = get_config_setting(copy_params_grid, do_cartesian)
    best_config, result = do_grid_search(command,
                                         args,
                                         keys,
                                         config_setting,
                                         repeat=repeat)
    print('best config after first stage: ' + str(best_config))
    print(result[best_config])

    # Stage 2: pin the second grid key to the best value found in stage 1.
    b = best_config[1]
    copy_params_grid = deepcopy(params_grid)
    copy_params_grid[keys[1]] = [b]
    config_setting = get_config_setting(copy_params_grid, do_cartesian)
    best_config, result = do_grid_search(command,
                                         args,
                                         keys,
                                         config_setting,
                                         repeat=repeat)
    print('best_config: ' + str(best_config))
    return best_config, result