def filter_pair(self, lstring, rstring):
        """Apply the prefix filter to a single string pair.

        Args:
        lstring, rstring : input strings

        Returns:
        result : boolean, True if the tuple pair is dropped.
        """
        # an empty string can never meet the threshold; drop the pair
        if (not lstring) or (not rstring):
            return True

        left_tokens = tokenize(lstring, self.tokenizer, self.sim_measure_type)
        right_tokens = tokenize(rstring, self.tokenizer, self.sim_measure_type)

        # impose a common global token ordering on both token lists
        ordering = gen_token_ordering_for_lists([left_tokens, right_tokens])
        left_ordered = order_using_token_ordering(left_tokens, ordering)
        right_ordered = order_using_token_ordering(right_tokens, ordering)

        left_prefix = left_ordered[:get_prefix_length(len(left_ordered),
                                                      self.sim_measure_type,
                                                      self.threshold,
                                                      self.tokenizer)]
        right_prefix = right_ordered[:get_prefix_length(len(right_ordered),
                                                        self.sim_measure_type,
                                                        self.threshold,
                                                        self.tokenizer)]

        # the pair survives only if the two prefixes share at least one token
        return set(left_prefix).isdisjoint(right_prefix)
    def filter_pair(self, lstring, rstring):
        """Apply the size filter to a single string pair.

        Args:
        lstring, rstring : input strings

        Returns:
        result : boolean, True if the tuple pair is dropped.
        """
        # an empty string can never meet the threshold; drop the pair
        if (not lstring) or (not rstring):
            return True

        num_l = len(tokenize(lstring, self.tokenizer, self.sim_measure_type))
        num_r = len(tokenize(rstring, self.tokenizer, self.sim_measure_type))

        # token-count window that rstring must fall into, derived from
        # lstring's size and the similarity threshold
        lower = get_size_lower_bound(num_l, self.sim_measure_type,
                                     self.threshold)
        upper = get_size_upper_bound(num_l, self.sim_measure_type,
                                     self.threshold)

        return not (lower <= num_r <= upper)
    def filter_pair(self, lstring, rstring):
        """Apply the suffix filter to a single string pair.

        Args:
        lstring, rstring : input strings

        Returns:
        result : boolean, True if the tuple pair is dropped.
        """
        # an empty string can never meet the threshold; drop the pair
        if (not lstring) or (not rstring):
            return True

        left_tokens = tokenize(lstring, self.tokenizer, self.sim_measure_type)
        right_tokens = tokenize(rstring, self.tokenizer, self.sim_measure_type)

        # impose a common global token ordering on both token lists
        ordering = gen_token_ordering_for_lists([left_tokens, right_tokens])
        left_ordered = order_using_token_ordering(left_tokens, ordering)
        right_ordered = order_using_token_ordering(right_tokens, ordering)

        left_prefix_len = get_prefix_length(len(left_ordered),
                                            self.sim_measure_type,
                                            self.threshold, self.tokenizer)
        right_prefix_len = get_prefix_length(len(right_ordered),
                                             self.sim_measure_type,
                                             self.threshold, self.tokenizer)

        # delegate to the suffix test on everything past the two prefixes
        return self._filter_suffix(left_ordered[left_prefix_len:],
                                   right_ordered[right_prefix_len:],
                                   left_prefix_len, right_prefix_len,
                                   len(left_tokens), len(right_tokens))
Beispiel #4
0
    def filter_pair(self, lstring, rstring):
        """Filter two strings with position filter.

        Args:
        lstring, rstring : input strings

        Returns:
        result : boolean, True if the tuple pair is dropped.
        """
        # check for empty string
        if (not lstring) or (not rstring):
            return True

        ltokens = tokenize(lstring, self.tokenizer, self.sim_measure_type)
        rtokens = tokenize(rstring, self.tokenizer, self.sim_measure_type)

        token_ordering = gen_token_ordering_for_lists([ltokens, rtokens])
        ordered_ltokens = order_using_token_ordering(ltokens, token_ordering)
        ordered_rtokens = order_using_token_ordering(rtokens, token_ordering)

        l_num_tokens = len(ordered_ltokens)
        r_num_tokens = len(ordered_rtokens)

        l_prefix_length = get_prefix_length(l_num_tokens,
                                            self.sim_measure_type,
                                            self.threshold,
                                            self.tokenizer)
        r_prefix_length = get_prefix_length(r_num_tokens,
                                            self.sim_measure_type,
                                            self.threshold,
                                            self.tokenizer)

        # Map each token in the left prefix to its position.
        # BUG FIX: the original loop never advanced l_pos, so every prefix
        # token was recorded at position 0; that inflated
        # overlap_upper_bound below and made the filter weaker than
        # intended (cf. the r-side loop, which does advance r_pos).
        l_prefix_dict = {}
        for l_pos, token in enumerate(ordered_ltokens[0:l_prefix_length]):
            l_prefix_dict[token] = l_pos

        # minimum overlap required for the pair to possibly satisfy the
        # similarity threshold
        overlap_threshold = get_overlap_threshold(l_num_tokens, r_num_tokens,
                                                  self.sim_measure_type,
                                                  self.threshold,
                                                  self.tokenizer)
        current_overlap = 0
        r_pos = 0
        for token in ordered_rtokens[0:r_prefix_length]:
            l_pos = l_prefix_dict.get(token)
            if l_pos is not None:
                # best-case overlap still achievable from these positions on
                overlap_upper_bound = 1 + min(l_num_tokens - l_pos - 1,
                                              r_num_tokens - r_pos - 1)
                if (current_overlap + overlap_upper_bound) < overlap_threshold:
                    return True
                current_overlap += 1
            r_pos += 1

        # survive only if the prefixes shared at least one token
        if current_overlap > 0:
            return False
        return True
Beispiel #5
0
    def build(self):
        """Populate the position index and size map from self.table.

        For every row, the tokens of the index attribute are globally
        ordered and the tokens in the prefix are indexed together with
        their position; the row's total token count is kept in size_map.
        Returns True when the index has been built.
        """
        for row in self.table:
            index_string = str(row[self.index_attr])
            # skip rows whose index attribute is empty
            if not index_string:
                continue
            tokens = order_using_token_ordering(
                tokenize(index_string, self.tokenizer, self.sim_measure_type),
                self.token_ordering)
            num_tokens = len(tokens)
            prefix_length = get_prefix_length(num_tokens,
                                              self.sim_measure_type,
                                              self.threshold,
                                              self.tokenizer)

            row_id = row[self.key_attr]
            # index each prefix token along with its position in the row
            for pos, token in enumerate(tokens[0:prefix_length]):
                self.index.setdefault(token, []).append((row_id, pos))

            self.size_map[row_id] = num_tokens

        return True
Beispiel #6
0
    def filter_pair(self, lstring, rstring):
        """Apply the overlap filter to a single string pair.

        Args:
        lstring, rstring : input strings

        Returns:
        result : boolean, True if the tuple pair is dropped.
        """
        # an empty string can never meet the overlap requirement
        if (not lstring) or (not rstring):
            return True

        # drop the pair when the token overlap is below the required size
        num_common = overlap(tokenize(lstring, self.tokenizer),
                             tokenize(rstring, self.tokenizer))
        return num_common < self.overlap_size
Beispiel #7
0
    def build(self):
        """Populate the inverted index from self.table.

        Maps every token of the index attribute to the list of row keys
        containing it.  Returns True when the index has been built.
        """
        for row in self.table:
            index_string = str(row[self.index_attr])
            # skip rows whose index attribute is empty
            if not index_string:
                continue

            row_id = row[self.key_attr]
            for token in tokenize(index_string, self.tokenizer):
                self.index.setdefault(token, []).append(row_id)

        return True
    def build(self):
        """Populate the size index from self.table.

        Groups row keys by the token count of their index attribute and
        tracks the smallest and largest counts seen.  Returns True when
        the index has been built.
        """
        for row in self.table:
            index_string = str(row[self.index_attr])
            # skip rows whose index attribute is empty
            if not index_string:
                continue

            num_tokens = len(tokenize(index_string, self.tokenizer))
            self.index.setdefault(num_tokens, []).append(row[self.key_attr])

            # maintain the observed token-count range
            self.min_length = min(self.min_length, num_tokens)
            self.max_length = max(self.max_length, num_tokens)

        return True
def gen_token_ordering_for_tables(table_list,
                                  attr_list,
                                  tokenizer,
                                  sim_measure_type='OVERLAP'):
    """Build a global token ordering over one attribute of each table.

    Tokens are ranked by ascending frequency across all tables (ties broken
    alphabetically) and numbered starting from 1.  table_list and attr_list
    are parallel: attr_list[i] is the attribute index to tokenize in
    table_list[i].  Returns a dict mapping token -> rank.
    """
    freq = {}
    for table, attr in zip(table_list, attr_list):
        for row in table:
            for token in tokenize(str(row[attr]), tokenizer,
                                  sim_measure_type):
                freq[token] = freq.get(token, 0) + 1

    # sort by (frequency, token) — equivalent to the usual "sort by token,
    # then stable-sort by frequency" two-pass scheme
    ranked = sorted(freq.items(), key=itemgetter(1, 0))

    return {token: rank for rank, (token, _) in enumerate(ranked, 1)}
def _filter_tables_split(ltable, rtable, l_key_attr, r_key_attr, l_filter_attr,
                         r_filter_attr, size_filter, l_out_attrs, r_out_attrs,
                         l_out_prefix, r_out_prefix):
    """Apply the size filter to one split of the input tables.

    Builds a size index over ltable's filter attribute, then for each row of
    rtable computes the token-count window allowed by the filter's similarity
    measure and threshold and probes the index for candidate ltable rows.
    Returns a pandas DataFrame of surviving (l_key, r_key) pairs, optionally
    extended with the requested output attributes.
    """
    # find column indices of key attr, filter attr and output attrs in ltable
    l_columns = list(ltable.columns.values)
    l_key_attr_index = l_columns.index(l_key_attr)
    l_filter_attr_index = l_columns.index(l_filter_attr)
    l_out_attrs_indices = []
    l_out_attrs_indices = find_output_attribute_indices(l_columns, l_out_attrs)

    # find column indices of key attr, filter attr and output attrs in rtable
    r_columns = list(rtable.columns.values)
    r_key_attr_index = r_columns.index(r_key_attr)
    r_filter_attr_index = r_columns.index(r_filter_attr)
    r_out_attrs_indices = find_output_attribute_indices(r_columns, r_out_attrs)

    # build a dictionary on ltable
    ltable_dict = build_dict_from_table(ltable, l_key_attr_index,
                                        l_filter_attr_index)

    # build a dictionary on rtable
    rtable_dict = build_dict_from_table(rtable, r_key_attr_index,
                                        r_filter_attr_index)

    # Build size index over ltable
    size_index = SizeIndex(ltable_dict.values(), l_key_attr_index,
                           l_filter_attr_index, size_filter.tokenizer)
    size_index.build()

    output_rows = []
    has_output_attributes = (l_out_attrs is not None
                             or r_out_attrs is not None)
    # NOTE(review): bar is sized by len(rtable) but the loop iterates
    # rtable_dict values — the counts differ if rtable has duplicate keys
    prog_bar = pyprind.ProgBar(len(rtable))

    for r_row in rtable_dict.values():
        r_id = r_row[r_key_attr_index]
        r_string = str(r_row[r_filter_attr_index])
        # check for empty string
        if not r_string:
            continue
        r_num_tokens = len(
            tokenize(r_string, size_filter.tokenizer,
                     size_filter.sim_measure_type))

        # token-count window implied by the similarity threshold
        size_lower_bound = get_size_lower_bound(r_num_tokens,
                                                size_filter.sim_measure_type,
                                                size_filter.threshold)
        size_upper_bound = get_size_upper_bound(r_num_tokens,
                                                size_filter.sim_measure_type,
                                                size_filter.threshold)

        # clamp the window to the sizes actually present in the index
        size_lower_bound = (size_index.min_length
                            if size_lower_bound < size_index.min_length else
                            size_lower_bound)

        size_upper_bound = (size_index.max_length
                            if size_upper_bound > size_index.max_length else
                            size_upper_bound)

        # probe size index and find candidates
        candidates = _find_candidates(size_lower_bound, size_upper_bound,
                                      size_index)

        for cand in candidates:
            if has_output_attributes:
                output_row = get_output_row_from_tables(
                    ltable_dict[cand], r_row, cand, r_id, l_out_attrs_indices,
                    r_out_attrs_indices)
                output_rows.append(output_row)
            else:
                output_rows.append([cand, r_id])

        prog_bar.update()

    output_header = get_output_header_from_tables(l_key_attr, r_key_attr,
                                                  l_out_attrs, r_out_attrs,
                                                  l_out_prefix, r_out_prefix)

    output_table = pd.DataFrame(output_rows, columns=output_header)
    return output_table
def _filter_tables_split(ltable, rtable,
                         l_key_attr, r_key_attr,
                         l_filter_attr, r_filter_attr,
                         prefix_filter,
                         l_out_attrs, r_out_attrs,
                         l_out_prefix, r_out_prefix):
    """Apply the prefix filter to one split of the input tables.

    Generates a global token ordering from both filter attributes, builds a
    prefix index over ltable, then probes it with each rtable row's ordered
    tokens to find candidate pairs.  Returns a pandas DataFrame of surviving
    (l_key, r_key) pairs, optionally extended with output attributes.
    """
    # find column indices of key attr, filter attr and output attrs in ltable
    l_columns = list(ltable.columns.values)
    l_key_attr_index = l_columns.index(l_key_attr)
    l_filter_attr_index = l_columns.index(l_filter_attr)
    l_out_attrs_indices = []
    l_out_attrs_indices = find_output_attribute_indices(l_columns, l_out_attrs)

    # find column indices of key attr, filter attr and output attrs in rtable
    r_columns = list(rtable.columns.values)
    r_key_attr_index = r_columns.index(r_key_attr)
    r_filter_attr_index = r_columns.index(r_filter_attr)
    r_out_attrs_indices = find_output_attribute_indices(r_columns, r_out_attrs)

    # build a dictionary on ltable
    ltable_dict = build_dict_from_table(ltable, l_key_attr_index,
                                        l_filter_attr_index)

    # build a dictionary on rtable
    rtable_dict = build_dict_from_table(rtable, r_key_attr_index,
                                        r_filter_attr_index)

    # generate token ordering using tokens in l_filter_attr and r_filter_attr
    token_ordering = gen_token_ordering_for_tables(
                                 [ltable_dict.values(), rtable_dict.values()],
                                 [l_filter_attr_index, r_filter_attr_index],
                                 prefix_filter.tokenizer,
                                 prefix_filter.sim_measure_type)

    # Build prefix index on l_filter_attr
    prefix_index = PrefixIndex(ltable_dict.values(), l_key_attr_index,
                               l_filter_attr_index, prefix_filter.tokenizer,
                               prefix_filter.sim_measure_type,
                               prefix_filter.threshold, token_ordering)
    prefix_index.build()

    output_rows = []
    has_output_attributes = (l_out_attrs is not None or
                             r_out_attrs is not None)
    # NOTE(review): bar is sized by len(rtable) but the loop iterates
    # rtable_dict values — the counts differ if rtable has duplicate keys
    prog_bar = pyprind.ProgBar(len(rtable))

    for r_row in rtable_dict.values():
        r_id = r_row[r_key_attr_index]
        r_string = str(r_row[r_filter_attr_index])
        # check for empty string
        if not r_string:
            continue
        r_filter_attr_tokens = tokenize(r_string,
                                        prefix_filter.tokenizer,
                                        prefix_filter.sim_measure_type)
        r_ordered_tokens = order_using_token_ordering(r_filter_attr_tokens,
                                                      token_ordering)

        # probe prefix index and find candidates
        candidates = _find_candidates(r_ordered_tokens, len(r_ordered_tokens),
                                      prefix_filter, prefix_index)

        for cand in candidates:
            if has_output_attributes:
                output_row = get_output_row_from_tables(
                                     ltable_dict[cand], r_row,
                                     cand, r_id, 
                                     l_out_attrs_indices, r_out_attrs_indices)
                output_rows.append(output_row)
            else:
                output_rows.append([cand, r_id])

        prog_bar.update()

    output_header = get_output_header_from_tables(l_key_attr, r_key_attr,
                                                  l_out_attrs, r_out_attrs, 
                                                  l_out_prefix, r_out_prefix)

    # generate a dataframe from the list of output rows
    output_table = pd.DataFrame(output_rows, columns=output_header)
    return output_table
Beispiel #12
0
def _set_sim_join_split(ltable, rtable,
                        l_key_attr, r_key_attr,
                        l_join_attr, r_join_attr,
                        tokenizer,
                        sim_measure_type,
                        threshold,
                        l_out_attrs, r_out_attrs,
                        l_out_prefix, r_out_prefix,
                        out_sim_score):
    """Perform set similarity join for a split of ltable and rtable.

    Pipeline: build a global token ordering from both join attributes and a
    position index over ltable; for each rtable row, gather candidates via
    the position filter, prune them with the suffix filter, and finally
    verify the exact similarity against the threshold.  Returns a pandas
    DataFrame of matching pairs (with the score if out_sim_score is set).
    """

    # find column indices of key attr, join attr and output attrs in ltable
    l_columns = list(ltable.columns.values)
    l_key_attr_index = l_columns.index(l_key_attr)
    l_join_attr_index = l_columns.index(l_join_attr)
    l_out_attrs_indices = find_output_attribute_indices(l_columns, l_out_attrs)

    # find column indices of key attr, join attr and output attrs in rtable
    r_columns = list(rtable.columns.values)
    r_key_attr_index = r_columns.index(r_key_attr)
    r_join_attr_index = r_columns.index(r_join_attr)
    r_out_attrs_indices = find_output_attribute_indices(r_columns, r_out_attrs)

    # build a dictionary on ltable
    ltable_dict = build_dict_from_table(ltable, l_key_attr_index,
                                        l_join_attr_index)

    # build a dictionary on rtable
    rtable_dict = build_dict_from_table(rtable, r_key_attr_index,
                                        r_join_attr_index)

    # generate token ordering using tokens in l_join_attr
    # and r_join_attr
    token_ordering = gen_token_ordering_for_tables(
                         [ltable_dict.values(),
                          rtable_dict.values()],
                         [l_join_attr_index,
                          r_join_attr_index],
                         tokenizer, sim_measure_type)

    # build a dictionary of tokenized l_join_attr
    # (cached so each ltable row is tokenized/ordered only once)
    l_join_attr_dict = {}
    for row in ltable_dict.values():
        l_join_attr_dict[row[l_key_attr_index]] = order_using_token_ordering(
            tokenize(str(row[l_join_attr_index]), tokenizer, sim_measure_type),
                                                  token_ordering)

    # Build position index on l_join_attr
    position_index = PositionIndex(ltable_dict.values(),
                                   l_key_attr_index, l_join_attr_index,
                                   tokenizer, sim_measure_type,
                                   threshold, token_ordering)
    position_index.build()

    pos_filter = PositionFilter(tokenizer, sim_measure_type, threshold)
    suffix_filter = SuffixFilter(tokenizer, sim_measure_type, threshold)
    sim_fn = get_sim_function(sim_measure_type)
    output_rows = []
    has_output_attributes = (l_out_attrs is not None or
                             r_out_attrs is not None)
    prog_bar = pyprind.ProgBar(len(rtable_dict.keys()))

    for r_row in rtable_dict.values():
        r_id = r_row[r_key_attr_index]
        r_string = str(r_row[r_join_attr_index])
        # check for empty string
        if not r_string:
            continue
        r_join_attr_tokens = tokenize(r_string, tokenizer, sim_measure_type)
        r_ordered_tokens = order_using_token_ordering(r_join_attr_tokens,
                                                      token_ordering)
        r_num_tokens = len(r_ordered_tokens)
        r_prefix_length = get_prefix_length(r_num_tokens,
                                            sim_measure_type,
                                            threshold, tokenizer)     

        # stage 1: position filter produces candidates with their overlap
        candidate_overlap = find_candidates_position_filter(
                                r_ordered_tokens, r_num_tokens, r_prefix_length,
                                pos_filter, position_index)
        for cand, overlap in iteritems(candidate_overlap):
            if overlap > 0:
                l_ordered_tokens = l_join_attr_dict[cand]
                l_num_tokens = position_index.get_size(cand)
                l_prefix_length = get_prefix_length(
                                      l_num_tokens,
                                      sim_measure_type,
                                      threshold, tokenizer)
                # stage 2: suffix filter on the tokens after both prefixes
                if not suffix_filter._filter_suffix(
                           l_ordered_tokens[l_prefix_length:],
                           r_ordered_tokens[r_prefix_length:],
                           l_prefix_length,
                           r_prefix_length,
                           l_num_tokens, r_num_tokens):
                    # stage 3: exact verification against the threshold
                    sim_score = sim_fn(l_ordered_tokens, r_ordered_tokens)
                    if sim_score >= threshold:
                        if has_output_attributes:
                            output_row = get_output_row_from_tables(
                                             ltable_dict[cand], r_row,
                                             cand, r_id,
                                             l_out_attrs_indices,
                                             r_out_attrs_indices)
                            if out_sim_score:
                                output_row.append(sim_score)
                            output_rows.append(output_row)
                        else:
                            output_row = [cand, r_id]
                            if out_sim_score:
                                output_row.append(sim_score)
                            output_rows.append(output_row)
        prog_bar.update()

    output_header = get_output_header_from_tables(
                        l_key_attr, r_key_attr,
                        l_out_attrs, r_out_attrs,
                        l_out_prefix, r_out_prefix)
    if out_sim_score:
        output_header.append("_sim_score")

    # generate a dataframe from the list of output rows
    output_table = pd.DataFrame(output_rows, columns=output_header)
    return output_table
Beispiel #13
0
def _edit_dist_join_split(ltable, rtable,
                          l_key_attr, r_key_attr,
                          l_join_attr, r_join_attr,
                          tokenizer,
                          threshold,
                          l_out_attrs, r_out_attrs,
                          l_out_prefix, r_out_prefix,
                          out_sim_score):
    """Perform edit-distance join for a split of ltable and rtable.

    Uses a prefix index (under the EDIT_DISTANCE measure) to generate
    candidates, prunes them with a string-length window of +/- threshold,
    then verifies the exact edit distance.  Returns a pandas DataFrame of
    matching pairs (with the distance if out_sim_score is set).
    """
    # find column indices of key attr, join attr and output attrs in ltable
    l_columns = list(ltable.columns.values)
    l_key_attr_index = l_columns.index(l_key_attr)
    l_join_attr_index = l_columns.index(l_join_attr)
    l_out_attrs_indices = find_output_attribute_indices(l_columns, l_out_attrs)

    # find column indices of key attr, join attr and output attrs in rtable
    r_columns = list(rtable.columns.values)
    r_key_attr_index = r_columns.index(r_key_attr)
    r_join_attr_index = r_columns.index(r_join_attr)
    r_out_attrs_indices = find_output_attribute_indices(r_columns, r_out_attrs)

    # build a dictionary on ltable
    ltable_dict = build_dict_from_table(ltable, l_key_attr_index,
                                        l_join_attr_index)

    # build a dictionary on rtable
    rtable_dict = build_dict_from_table(rtable, r_key_attr_index,
                                        r_join_attr_index)

    sim_measure_type = 'EDIT_DISTANCE'
    # generate token ordering using tokens in l_join_attr
    # and r_join_attr
    token_ordering = gen_token_ordering_for_tables(
                         [ltable_dict.values(),
                          rtable_dict.values()],
                         [l_join_attr_index,
                          r_join_attr_index],
                         tokenizer, sim_measure_type)

    # build a dictionary of l_join_attr lengths
    # (used for the length-window pruning below)
    l_join_attr_dict = {}
    for row in ltable_dict.values():
        l_join_attr_dict[row[l_key_attr_index]] = len(str(
                                                      row[l_join_attr_index]))

    # Build prefix index on l_join_attr
    prefix_index = PrefixIndex(ltable_dict.values(),
                               l_key_attr_index, l_join_attr_index,
                               tokenizer, sim_measure_type, threshold,
                               token_ordering)
    prefix_index.build()

    prefix_filter = PrefixFilter(tokenizer, sim_measure_type, threshold)
    sim_fn = get_sim_function(sim_measure_type)
    output_rows = []
    has_output_attributes = (l_out_attrs is not None or
                             r_out_attrs is not None)
    prog_bar = pyprind.ProgBar(len(rtable_dict.keys()))

    for r_row in rtable_dict.values():
        r_id = r_row[r_key_attr_index]
        r_string = str(r_row[r_join_attr_index])
        r_len = len(r_string)
        # check for empty string
        if not r_string:
            continue
        r_join_attr_tokens = tokenize(r_string, tokenizer, sim_measure_type)
        r_ordered_tokens = order_using_token_ordering(r_join_attr_tokens,
                                                      token_ordering)
        candidates = find_candidates_prefix_filter(
                         r_ordered_tokens, len(r_ordered_tokens),
                         prefix_filter, prefix_index) 
        for cand in candidates:
            # length pruning: strings whose lengths differ by more than
            # the threshold cannot be within that edit distance
            if r_len - threshold <= l_join_attr_dict[cand] <= r_len + threshold:
                edit_dist = sim_fn(str(ltable_dict[cand][l_join_attr_index]),
                                   r_string)
                # NOTE: edit distance is a dissimilarity, hence <= threshold
                if edit_dist <= threshold:
                    if has_output_attributes:
                        output_row = get_output_row_from_tables(
                                         ltable_dict[cand], r_row,
                                         cand, r_id,
                                         l_out_attrs_indices,
                                         r_out_attrs_indices)
                        if out_sim_score:
                            output_row.append(edit_dist)
                        output_rows.append(output_row)
                    else:
                        output_row = [cand, r_id]
                        if out_sim_score:
                            output_row.append(edit_dist)
                        output_rows.append(output_row)

        prog_bar.update()

    output_header = get_output_header_from_tables(
                        l_key_attr, r_key_attr,
                        l_out_attrs, r_out_attrs,
                        l_out_prefix, r_out_prefix)
    if out_sim_score:
        output_header.append("_sim_score")

    # generate a dataframe from the list of output rows
    output_table = pd.DataFrame(output_rows, columns=output_header)
    return output_table
Beispiel #14
0
def _filter_tables_split(ltable, rtable, l_key_attr, r_key_attr, l_filter_attr,
                         r_filter_attr, overlap_filter, l_out_attrs,
                         r_out_attrs, l_out_prefix, r_out_prefix,
                         out_sim_score):
    """Apply the overlap filter to one split of the input tables.

    Builds an inverted index over ltable's filter attribute, probes it with
    each rtable row's tokens to count token overlap per candidate, and keeps
    pairs whose overlap meets the filter's overlap_size.  Returns a pandas
    DataFrame of surviving pairs (with the overlap if out_sim_score is set).
    """
    # Find column indices of key attr, filter attr and output attrs in ltable
    l_columns = list(ltable.columns.values)
    l_key_attr_index = l_columns.index(l_key_attr)
    l_filter_attr_index = l_columns.index(l_filter_attr)
    l_out_attrs_indices = []
    l_out_attrs_indices = find_output_attribute_indices(l_columns, l_out_attrs)

    # Find column indices of key attr, filter attr and output attrs in rtable
    r_columns = list(rtable.columns.values)
    r_key_attr_index = r_columns.index(r_key_attr)
    r_filter_attr_index = r_columns.index(r_filter_attr)
    r_out_attrs_indices = find_output_attribute_indices(r_columns, r_out_attrs)

    # Build a dictionary on ltable
    ltable_dict = build_dict_from_table(ltable, l_key_attr_index,
                                        l_filter_attr_index)

    # Build a dictionary on rtable
    rtable_dict = build_dict_from_table(rtable, r_key_attr_index,
                                        r_filter_attr_index)

    # Build inverted index over ltable
    inverted_index = InvertedIndex(ltable_dict.values(), l_key_attr_index,
                                   l_filter_attr_index,
                                   overlap_filter.tokenizer)
    inverted_index.build()

    output_rows = []
    has_output_attributes = (l_out_attrs is not None
                             or r_out_attrs is not None)
    # NOTE(review): bar is sized by len(rtable) but the loop iterates
    # rtable_dict values — the counts differ if rtable has duplicate keys
    prog_bar = pyprind.ProgBar(len(rtable))

    for r_row in rtable_dict.values():
        r_id = r_row[r_key_attr_index]
        r_string = str(r_row[r_filter_attr_index])
        # check for empty string
        if not r_string:
            continue
        r_filter_attr_tokens = tokenize(r_string, overlap_filter.tokenizer)

        # probe inverted index and find overlap of candidates
        candidate_overlap = _find_candidates(r_filter_attr_tokens,
                                             inverted_index)

        for cand, overlap in iteritems(candidate_overlap):
            if overlap >= overlap_filter.overlap_size:
                if has_output_attributes:
                    output_row = get_output_row_from_tables(
                        ltable_dict[cand], r_row, cand, r_id,
                        l_out_attrs_indices, r_out_attrs_indices)
                    if out_sim_score:
                        output_row.append(overlap)
                    output_rows.append(output_row)
                else:
                    output_row = [cand, r_id]
                    if out_sim_score:
                        output_row.append(overlap)
                    output_rows.append(output_row)

        prog_bar.update()

    output_header = get_output_header_from_tables(l_key_attr, r_key_attr,
                                                  l_out_attrs, r_out_attrs,
                                                  l_out_prefix, r_out_prefix)
    if out_sim_score:
        output_header.append("_sim_score")

    output_table = pd.DataFrame(output_rows, columns=output_header)
    return output_table
Beispiel #15
0
def test_valid_join(scenario, sim_measure_type, args):
    """End-to-end check of a similarity join function.

    Computes the expected pair set by applying the similarity measure to the
    full cartesian product of the two input tables, runs the join function
    under test, and verifies both the output schema and the matched pairs.

    Args:
    scenario : pair of (csv_path, key_attr, join_attr) tuples for the
               left and right tables
    sim_measure_type : String, key into JOIN_FN_MAP selecting the join
                       function and similarity measure under test
    args : positional args forwarded to the join function:
           (tokenizer, threshold[, l_out_attrs, r_out_attrs,
            l_out_prefix, r_out_prefix, out_sim_score])
    """
    (ltable_path, l_key_attr, l_join_attr) = scenario[0]
    (rtable_path, r_key_attr, r_join_attr) = scenario[1]
    join_fn = JOIN_FN_MAP[sim_measure_type]

    # load input tables for the tests.
    ltable = pd.read_csv(os.path.join(os.path.dirname(__file__), ltable_path))
    rtable = pd.read_csv(os.path.join(os.path.dirname(__file__), rtable_path))

    # generate cartesian product to be used as candset
    ltable['tmp_join_key'] = 1
    rtable['tmp_join_key'] = 1
    cartprod = pd.merge(ltable[[l_key_attr, l_join_attr, 'tmp_join_key']],
                        rtable[[r_key_attr, r_join_attr, 'tmp_join_key']],
                        on='tmp_join_key').drop('tmp_join_key', axis=1)
    # DataFrame.drop returns a copy; without inplace=True the temporary join
    # key column would silently remain in the tables passed to join_fn below.
    # (Also: the positional axis argument was removed in pandas 2.0.)
    ltable.drop('tmp_join_key', axis=1, inplace=True)
    rtable.drop('tmp_join_key', axis=1, inplace=True)

    sim_func = get_sim_function(sim_measure_type)

    # apply sim function to the entire cartesian product to obtain
    # the expected set of pairs satisfying the threshold.
    cartprod['sim_score'] = cartprod.apply(lambda row: sim_func(
        tokenize(str(row[l_join_attr]), args[0], sim_measure_type),
        tokenize(str(row[r_join_attr]), args[0], sim_measure_type)),
                                           axis=1)

    # args[1] is the similarity threshold.
    expected_pairs = set()
    for idx, row in cartprod.iterrows():
        if float(row['sim_score']) >= args[1]:
            expected_pairs.add(','.join(
                (str(row[l_key_attr]), str(row[r_key_attr]))))

    # use join function to obtain actual output pairs.
    actual_candset = join_fn(ltable, rtable, l_key_attr, r_key_attr,
                             l_join_attr, r_join_attr, *args)

    expected_output_attrs = ['_id']
    l_out_prefix = DEFAULT_L_OUT_PREFIX
    r_out_prefix = DEFAULT_R_OUT_PREFIX

    # Check for l_out_prefix in args.
    if len(args) > 4:
        l_out_prefix = args[4]
    expected_output_attrs.append(l_out_prefix + l_key_attr)

    # Check for l_out_attrs in args.
    if len(args) > 2 and args[2]:
        for attr in args[2]:
            expected_output_attrs.append(l_out_prefix + attr)

    # Check for r_out_prefix in args.
    if len(args) > 5:
        r_out_prefix = args[5]
    expected_output_attrs.append(r_out_prefix + r_key_attr)

    # Check for r_out_attrs in args.
    if len(args) > 3 and args[3]:
        for attr in args[3]:
            expected_output_attrs.append(r_out_prefix + attr)

    # Check for out_sim_score in args; the score column is emitted by
    # default when the flag is not supplied.
    if len(args) > 6:
        if args[6]:
            expected_output_attrs.append('_sim_score')
    else:
        expected_output_attrs.append('_sim_score')

    # verify whether the output table has the necessary attributes.
    assert_list_equal(list(actual_candset.columns.values),
                      expected_output_attrs)

    actual_pairs = set()
    for idx, row in actual_candset.iterrows():
        actual_pairs.add(','.join((str(row[l_out_prefix + l_key_attr]),
                                   str(row[r_out_prefix + r_key_attr]))))

    # verify whether the actual pairs and the expected pairs match.
    assert_equal(len(expected_pairs), len(actual_pairs))
    common_pairs = actual_pairs.intersection(expected_pairs)
    assert_equal(len(common_pairs), len(expected_pairs))
    def filter_tables(self,
                      ltable,
                      rtable,
                      l_key_attr,
                      r_key_attr,
                      l_filter_attr,
                      r_filter_attr,
                      l_out_attrs=None,
                      r_out_attrs=None,
                      l_out_prefix='l_',
                      r_out_prefix='r_'):
        """Filter tables with suffix filter.

        Compares every pair of rows from the two tables (nested loop) and
        keeps a pair only when the suffix filter does not prune it, i.e.
        when self._filter_suffix returns False for the ordered token
        suffixes of the two filter-attribute strings.

        Args:
        ltable, rtable : Pandas data frame
        l_key_attr, r_key_attr : String, key attribute from ltable and rtable
        l_filter_attr, r_filter_attr : String, filter attribute from ltable and rtable
        l_out_attrs, r_out_attrs : list of attributes to be included in the output table from ltable and rtable
        l_out_prefix, r_out_prefix : String, prefix to be used in the attribute names of the output table

        Returns:
        result : Pandas data frame of surviving pairs, with an '_id'
                 column prepended
        """
        # check if the input tables are dataframes
        validate_input_table(ltable, 'left table')
        validate_input_table(rtable, 'right table')

        # check if the key attributes and filter attributes exist
        validate_attr(l_key_attr, ltable.columns, 'key attribute',
                      'left table')
        validate_attr(r_key_attr, rtable.columns, 'key attribute',
                      'right table')
        validate_attr(l_filter_attr, ltable.columns, 'filter attribute',
                      'left table')
        validate_attr(r_filter_attr, rtable.columns, 'filter attribute',
                      'right table')

        # check if the output attributes exist
        validate_output_attrs(l_out_attrs, ltable.columns, r_out_attrs,
                              rtable.columns)

        # check if the key attributes are unique and do not contain missing values
        validate_key_attr(l_key_attr, ltable, 'left table')
        validate_key_attr(r_key_attr, rtable, 'right table')

        # find column indices of key attr, filter attr and
        # output attrs in ltable
        l_columns = list(ltable.columns.values)
        l_key_attr_index = l_columns.index(l_key_attr)
        l_filter_attr_index = l_columns.index(l_filter_attr)
        l_out_attrs_indices = find_output_attribute_indices(
            l_columns, l_out_attrs)

        # find column indices of key attr, filter attr and
        # output attrs in rtable
        r_columns = list(rtable.columns.values)
        r_key_attr_index = r_columns.index(r_key_attr)
        r_filter_attr_index = r_columns.index(r_filter_attr)
        r_out_attrs_indices = find_output_attribute_indices(
            r_columns, r_out_attrs)

        # build a dictionary on ltable
        ltable_dict = build_dict_from_table(ltable, l_key_attr_index,
                                            l_filter_attr_index)

        # build a dictionary on rtable
        rtable_dict = build_dict_from_table(rtable, r_key_attr_index,
                                            r_filter_attr_index)

        # generate token ordering using tokens in l_filter_attr
        # and r_filter_attr
        token_ordering = gen_token_ordering_for_tables(
            [ltable_dict.values(), rtable_dict.values()],
            [l_filter_attr_index, r_filter_attr_index], self.tokenizer,
            self.sim_measure_type)

        output_rows = []
        has_output_attributes = (l_out_attrs is not None
                                 or r_out_attrs is not None)
        prog_bar = pyprind.ProgBar(len(ltable))

        for l_row in ltable_dict.values():
            l_id = l_row[l_key_attr_index]
            l_string = str(l_row[l_filter_attr_index])
            # check for empty string
            if not l_string:
                continue
            ltokens = tokenize(l_string, self.tokenizer, self.sim_measure_type)
            ordered_ltokens = order_using_token_ordering(
                ltokens, token_ordering)
            l_num_tokens = len(ordered_ltokens)
            l_prefix_length = get_prefix_length(l_num_tokens,
                                                self.sim_measure_type,
                                                self.threshold, self.tokenizer)
            # tokens after the prefix; computed once per left row and
            # reused across the entire inner loop
            l_suffix = ordered_ltokens[l_prefix_length:]
            for r_row in rtable_dict.values():
                r_id = r_row[r_key_attr_index]
                r_string = str(r_row[r_filter_attr_index])
                # check for empty string
                if not r_string:
                    continue
                rtokens = tokenize(r_string, self.tokenizer,
                                   self.sim_measure_type)
                ordered_rtokens = order_using_token_ordering(
                    rtokens, token_ordering)
                r_num_tokens = len(ordered_rtokens)
                r_prefix_length = get_prefix_length(r_num_tokens,
                                                    self.sim_measure_type,
                                                    self.threshold,
                                                    self.tokenizer)
                # _filter_suffix returns True when the pair can be pruned;
                # keep the pair only when it returns False
                if not self._filter_suffix(
                        l_suffix, ordered_rtokens[r_prefix_length:],
                        l_prefix_length, r_prefix_length, l_num_tokens,
                        r_num_tokens):
                    if has_output_attributes:
                        # l_row is ltable_dict[l_id]; use it directly instead
                        # of a redundant dict lookup in the inner loop
                        output_row = get_output_row_from_tables(
                            l_row, r_row, l_id, r_id,
                            l_out_attrs_indices, r_out_attrs_indices)
                        output_rows.append(output_row)
                    else:
                        output_rows.append([l_id, r_id])

            prog_bar.update()

        output_header = get_output_header_from_tables(l_key_attr, r_key_attr,
                                                      l_out_attrs, r_out_attrs,
                                                      l_out_prefix,
                                                      r_out_prefix)

        # generate a dataframe from the list of output rows
        output_table = pd.DataFrame(output_rows, columns=output_header)
        output_table.insert(0, '_id', range(0, len(output_table)))
        return output_table