def filter_pair(self, lstring, rstring):
        """Filter two strings with prefix filter.

        Args:
        lstring, rstring : input strings

        Returns:
        result : boolean, True if the tuple pair is dropped.
        """
        # check for empty string
        if (not lstring) or (not rstring):
            return True

        ltokens = tokenize(lstring, self.tokenizer, self.sim_measure_type)
        rtokens = tokenize(rstring, self.tokenizer, self.sim_measure_type)

        token_ordering = gen_token_ordering_for_lists([ltokens, rtokens])
        ordered_ltokens = order_using_token_ordering(ltokens, token_ordering)
        ordered_rtokens = order_using_token_ordering(rtokens, token_ordering)

        l_prefix_length = get_prefix_length(len(ordered_ltokens),
                                            self.sim_measure_type,
                                            self.threshold,
                                            self.tokenizer) 
        r_prefix_length = get_prefix_length(len(ordered_rtokens),
                                            self.sim_measure_type,
                                            self.threshold,
                                            self.tokenizer)
        prefix_overlap = set(ordered_ltokens[0:l_prefix_length]).intersection(
                         set(ordered_rtokens[0:r_prefix_length]))

        if len(prefix_overlap) > 0:
            return False
        else:
            return True
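The key property behind this filter: for Jaccard at threshold t, the prefix of a size-n ordered token list has length n - ceil(t * n) + 1, and two sets with disjoint prefixes cannot reach the threshold. Below is a minimal standalone sketch of that check (plain Python with illustrative names, independent of the helpers above); note the filter only prunes, so surviving pairs still need their actual similarity verified.

import math

def jaccard_prefix_length(num_tokens, threshold):
    # prefix scheme for Jaccard: n - ceil(t * n) + 1
    return max(0, num_tokens - int(math.ceil(threshold * num_tokens)) + 1)

def prefix_filter_pair(ltokens, rtokens, threshold):
    # tokens must already be sorted by a global token ordering
    l_prefix = set(ltokens[:jaccard_prefix_length(len(ltokens), threshold)])
    r_prefix = set(rtokens[:jaccard_prefix_length(len(rtokens), threshold)])
    # drop the pair (return True) when the prefixes share no token
    return l_prefix.isdisjoint(r_prefix)

# 5 tokens at t = 0.8 give prefix length 5 - 4 + 1 = 2; the prefixes
# {'aa', 'bb'} and {'aa', 'cc'} overlap, so the pair survives
assert not prefix_filter_pair(['aa', 'bb', 'cc', 'dd', 'ee'],
                              ['aa', 'cc', 'dd', 'ee', 'ff'], 0.8)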
Example #2
    def test_find_candidates(self):
        # test default case (presence of candidates)
        tokens = order_using_token_ordering(['aa', 'ef', 'ab', 'cd'],
                                            token_ordering)
        self.assertSetEqual(
            self.position_filter.find_candidates(tokens, len(tokens), 0.8),
            set([0, 3]))

        # test empty set of candidates
        tokens = order_using_token_ordering(['op', 'lp', 'mp'], token_ordering)
        self.assertSetEqual(
            self.position_filter.find_candidates(tokens, len(tokens), 0.8),
            set())

        # prefix index returns 2 candidates whereas position index prunes them
        tokens = order_using_token_ordering(['aa', 'ef', 'lp'], token_ordering)
        self.assertSetEqual(
            self.position_filter.find_candidates(tokens, len(tokens), 0.8),
            set())

        # test empty list of probe tokens
        tokens = order_using_token_ordering([], token_ordering)
        self.assertSetEqual(
            self.position_filter.find_candidates(tokens, len(tokens), 0.8),
            set())
Example #3
    def filter_pair(self, lstring, rstring):
        """Filter two strings with suffix filter.

        Args:
        lstring, rstring : input strings

        Returns:
        result : boolean, True if the tuple pair is dropped.
        """
        # check for empty string
        if (not lstring) or (not rstring):
            return True

        ltokens = tokenize(lstring, self.tokenizer, self.sim_measure_type)
        rtokens = tokenize(rstring, self.tokenizer, self.sim_measure_type)

        token_ordering = gen_token_ordering_for_lists([ltokens, rtokens])
        ordered_ltokens = order_using_token_ordering(ltokens, token_ordering)
        ordered_rtokens = order_using_token_ordering(rtokens, token_ordering)

        l_num_tokens = len(ordered_ltokens)
        r_num_tokens = len(ordered_rtokens)
        l_prefix_length = get_prefix_length(l_num_tokens,
                                            self.sim_measure_type,
                                            self.threshold, self.tokenizer)
        r_prefix_length = get_prefix_length(r_num_tokens,
                                            self.sim_measure_type,
                                            self.threshold, self.tokenizer)
        return self._filter_suffix(ordered_ltokens[l_prefix_length:],
                                   ordered_rtokens[r_prefix_length:],
                                   l_prefix_length, r_prefix_length,
                                   len(ltokens), len(rtokens))
Example #4
    def filter_pair(self, lstring, rstring):
        """Filter two strings with position filter.

        Args:
        lstring, rstring : input strings

        Returns:
        result : boolean, True if the tuple pair is dropped.
        """
        # check for empty string
        if (not lstring) or (not rstring):
            return True

        ltokens = tokenize(lstring, self.tokenizer, self.sim_measure_type)
        rtokens = tokenize(rstring, self.tokenizer, self.sim_measure_type)

        token_ordering = gen_token_ordering_for_lists([ltokens, rtokens])
        ordered_ltokens = order_using_token_ordering(ltokens, token_ordering)
        ordered_rtokens = order_using_token_ordering(rtokens, token_ordering)

        l_num_tokens = len(ordered_ltokens)
        r_num_tokens = len(ordered_rtokens)

        l_prefix_length = get_prefix_length(l_num_tokens,
                                            self.sim_measure_type,
                                            self.threshold,
                                            self.tokenizer) 
        r_prefix_length = get_prefix_length(r_num_tokens,
                                            self.sim_measure_type,
                                            self.threshold,
                                            self.tokenizer)
 
        l_prefix_dict = {}
        l_pos = 0
        for token in ordered_ltokens[0:l_prefix_length]:
            l_prefix_dict[token] = l_pos
            l_pos += 1

        overlap_threshold = get_overlap_threshold(l_num_tokens, r_num_tokens,
                                                  self.sim_measure_type,
                                                  self.threshold,
                                                  self.tokenizer)
        current_overlap = 0
        r_pos = 0 
        for token in ordered_rtokens[0:r_prefix_length]:
            l_pos = l_prefix_dict.get(token)
            if l_pos is not None:
                overlap_upper_bound = 1 + min(l_num_tokens - l_pos - 1,
                                              r_num_tokens - r_pos - 1)
                if (current_overlap + overlap_upper_bound) < overlap_threshold:
                    return True
                current_overlap += 1
            r_pos += 1

        if current_overlap > 0:
            return False
        return True
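The pruning test above compares a running overlap count plus an upper bound on the remaining overlap against get_overlap_threshold. For Jaccard, that threshold follows from jaccard = o / (|X| + |Y| - o) >= t, which rearranges to o >= t / (1 + t) * (|X| + |Y|). A standalone sketch of the Jaccard case (illustrative name, not the library helper):

import math

def jaccard_overlap_threshold(l_size, r_size, threshold):
    # jaccard = o / (l + r - o) >= t  <=>  o >= t / (1 + t) * (l + r)
    return int(math.ceil((threshold / (1 + threshold)) * (l_size + r_size)))

# two sets of size 5 at t = 0.8 must share at least ceil(0.8 / 1.8 * 10) = 5
# tokens, i.e. be identical
print(jaccard_overlap_threshold(5, 5, 0.8))   # 5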
Example #5
    def filter_pair(self, lstring, rstring):
        """Checks if the input strings get dropped by the suffix filter.

        Args:
            lstring,rstring (string): input strings

        Returns:
            A flag indicating whether the string pair is dropped (boolean).
        """

        # If one of the inputs is missing, then check the allow_missing flag.
        # If it is set to True, then pass the pair. Else drop the pair.
        if pd.isnull(lstring) or pd.isnull(rstring):
            return (not self.allow_missing)

        # tokenize input strings
        ltokens = self.tokenizer.tokenize(lstring)
        rtokens = self.tokenizer.tokenize(rstring)    

        l_num_tokens = len(ltokens)
        r_num_tokens = len(rtokens)

        if l_num_tokens == 0 and r_num_tokens == 0:
            if self.sim_measure_type == 'OVERLAP':
                return True
            elif self.sim_measure_type == 'EDIT_DISTANCE':
                return False
            else:
                return (not self.allow_empty)

        # order the tokens using the token ordering
        token_ordering = gen_token_ordering_for_lists([ltokens, rtokens]) 
        ordered_ltokens = order_using_token_ordering(ltokens, token_ordering)
        ordered_rtokens = order_using_token_ordering(rtokens, token_ordering)

        # compute prefix length
        l_prefix_length = get_prefix_length(l_num_tokens,
                                            self.sim_measure_type,
                                            self.threshold,
                                            self.tokenizer)
        r_prefix_length = get_prefix_length(r_num_tokens,
                                            self.sim_measure_type,
                                            self.threshold,
                                            self.tokenizer)

        if l_prefix_length <= 0 or r_prefix_length <= 0:
            return True

        return self._filter_suffix(ordered_ltokens[l_prefix_length:],
                             ordered_rtokens[r_prefix_length:],
                             l_prefix_length,
                             r_prefix_length,
                             l_num_tokens, r_num_tokens)
Example #6
    def test_apply_filter(self):
        # prefix filter satisfies
        l_tokens = order_using_token_ordering(['aa', 'bb', 'cd', 'ef', 'fg'],
                                              token_ordering)
        r_tokens = order_using_token_ordering(['fg', 'cd', 'aa'],
                                              token_ordering)
        self.assertTrue(
            self.prefix_filter.apply_filter(l_tokens, r_tokens, len(l_tokens),
                                            len(r_tokens), 0.8))

        l_tokens = order_using_token_ordering(['aa', 'bb', 'cd', 'ef', 'fg'],
                                              token_ordering)
        r_tokens = order_using_token_ordering(['aa'], token_ordering)
        self.assertTrue(
            self.prefix_filter.apply_filter(l_tokens, r_tokens, len(l_tokens),
                                            len(r_tokens), 0.8))

        # prefix filter doesn't satisfy
        l_tokens = order_using_token_ordering(['aa', 'bb', 'cd', 'ef', 'fg'],
                                              token_ordering)
        r_tokens = order_using_token_ordering(['fg'], token_ordering)
        self.assertFalse(
            self.prefix_filter.apply_filter(l_tokens, r_tokens, len(l_tokens),
                                            len(r_tokens), 0.8))

        # test empty list of tokens
        l_tokens = order_using_token_ordering(['aa', 'bb', 'cd', 'ef', 'fg'],
                                              token_ordering)
        r_tokens = order_using_token_ordering([], token_ordering)
        self.assertFalse(
            self.prefix_filter.apply_filter(l_tokens, r_tokens, len(l_tokens),
                                            len(r_tokens), 0.8))
        self.assertFalse(
            self.prefix_filter.apply_filter(r_tokens, l_tokens, len(r_tokens),
                                            len(l_tokens), 0.8))
Example #7
    def build(self):
        for row in self.table:
            index_string = str(row[self.index_attr])
            # check for empty string
            if not index_string:
                continue
            index_attr_tokens = order_using_token_ordering(tokenize(
                                        index_string,
                                        self.tokenizer,
                                        self.sim_measure_type),
                                    self.token_ordering)
            num_tokens = len(index_attr_tokens)
            prefix_length = get_prefix_length(
                                num_tokens,
                                self.sim_measure_type, self.threshold,
                                self.tokenizer)
 
            row_id = row[self.key_attr]
            pos = 0
            for token in index_attr_tokens[0:prefix_length]:
                if self.index.get(token) is None:
                    self.index[token] = []
                self.index.get(token).append((row_id, pos))
                pos += 1

            self.size_map[row_id] = num_tokens

        return True
Example #8
    def build(self, cache_empty_records=True):
        """Build prefix index."""
        self.index = {}
        empty_records = []
        row_id = 0
        for row in self.table:
            # tokenize string and order the tokens using the token ordering
            index_string = row[self.index_attr]
            index_attr_tokens = order_using_token_ordering(
                self.tokenizer.tokenize(index_string), self.token_ordering)

            # compute prefix length
            num_tokens = len(index_attr_tokens)
            prefix_length = get_prefix_length(
                                num_tokens,
                                self.sim_measure_type, self.threshold,
                                self.tokenizer)
 
            # update index
            for token in index_attr_tokens[0:prefix_length]:
                if self.index.get(token) is None:
                    self.index[token] = []
                self.index.get(token).append(row_id)

            if cache_empty_records and num_tokens == 0:
                empty_records.append(row_id)

            row_id += 1

        return {'empty_records': empty_records}
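The dictionary built above maps each prefix token to the ids of the rows whose prefix contains it. A minimal sketch of how such an index might be probed for candidates (probe_prefix_index is hypothetical, not a library function):

def probe_prefix_index(index, probe_prefix_tokens):
    # collect ids of all indexed rows whose prefix shares a token
    # with the probe prefix
    candidates = set()
    for token in probe_prefix_tokens:
        candidates.update(index.get(token, []))
    return candidates

print(probe_prefix_index({'aa': [0, 3], 'cd': [1]}, ['aa', 'xy']))  # {0, 3}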
Example #10
    def test_find_candidates(self):
        # test default case (presence of candidates)
        tokens = order_using_token_ordering(['aa', 'ef', 'lp'], token_ordering)
        self.assertSetEqual(
            self.prefix_filter.find_candidates(tokens, len(tokens), 0.8),
            set([0, 3]))

        # test empty set of candidates
        tokens = order_using_token_ordering(['op', 'lp', 'mp'], token_ordering)
        self.assertSetEqual(
            self.prefix_filter.find_candidates(tokens, len(tokens), 0.8),
            set())

        # test empty list of probe tokens
        tokens = order_using_token_ordering([], token_ordering)
        self.assertSetEqual(
            self.prefix_filter.find_candidates(tokens, len(tokens), 0.8),
            set())
Example #11
def jaccard_join_auto(ltable,
                      rtable,
                      l_id_attr,
                      l_join_attr,
                      r_id_attr,
                      r_join_attr,
                      threshold,
                      ltable_output_attrs=None,
                      rtable_output_attrs=None):
    matches_list = []
    sim_function = get_jaccard_fn()
    token_ordering = gen_token_ordering(ltable, l_join_attr)
    position_filter = PositionFilter(ltable,
                                     l_id_attr,
                                     l_join_attr,
                                     threshold,
                                     token_ordering,
                                     adaptive_prefix=True)
    position_filter.build_index()

    prog_bar = pyprind.ProgBar(len(rtable.index))

    l_row_dict = {}
    for idx, l_row in ltable.iterrows():
        l_id = l_row[l_id_attr]
        l_row_dict[l_id] = l_row

    r_row_dict = {}
    for idx, r_row in rtable.iterrows():
        r_id = r_row[r_id_attr]
        r_row_dict[r_id] = r_row

    for r_id in r_row_dict.keys():
        r_row = r_row_dict[r_id]
        r_tokens = order_using_token_ordering(list(r_row[r_join_attr]),
                                              token_ordering)
        r_num_tokens = len(r_tokens)

        l_cand_ids = position_filter.find_candidates(r_tokens, r_num_tokens,
                                                     threshold)
        for l_id in l_cand_ids:
            l_row = l_row_dict[l_id]
            if sim_function(l_row[l_join_attr],
                            r_row[r_join_attr]) >= threshold:
                match_dict = get_output_attributes(l_row, r_row, l_id_attr,
                                                   l_id, r_id_attr, r_id,
                                                   ltable_output_attrs,
                                                   rtable_output_attrs)
                matches_list.append(match_dict)
            #  matches_list.append(str(l_id)+','+str(r_id))
        prog_bar.update()

    output_matches = pd.DataFrame(matches_list)
    return output_matches
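Assuming the imports used by the function above (pandas, pyprind, and the filter/utility helpers) are available, a hypothetical invocation with toy data could look like this; the table and column names are made up for illustration:

import pandas as pd

ltable = pd.DataFrame({'id': [1, 2], 'title': ['data mining', 'databases']})
rtable = pd.DataFrame({'id': [7, 8], 'title': ['data mining', 'data science']})

matches = jaccard_join_auto(ltable, rtable, 'id', 'title', 'id', 'title',
                            threshold=0.5,
                            ltable_output_attrs=['title'],
                            rtable_output_attrs=['title'])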
Example #12
    def build(self, cache_empty_records=True, cache_tokens=False):
        """Build position index."""
        self.index = {}
        self.size_cache = []
        cached_tokens = []
        empty_records = []
        row_id = 0
        for row in self.table:
            # tokenize string and order the tokens using the token ordering     
            index_string = row[self.index_attr]
            index_attr_tokens = order_using_token_ordering(
                self.tokenizer.tokenize(index_string), self.token_ordering)

            # compute prefix length
            num_tokens = len(index_attr_tokens)
            prefix_length = get_prefix_length(
                                num_tokens,
                                self.sim_measure_type, self.threshold,
                                self.tokenizer)

            # update the index
            pos = 0
            for token in index_attr_tokens[0:prefix_length]:
                if self.index.get(token) is None:
                    self.index[token] = []
                self.index.get(token).append((row_id, pos))
                pos += 1

            self.size_cache.append(num_tokens)

            # keep track of the max size and min size.
            if num_tokens < self.min_length:
                self.min_length = num_tokens

            if num_tokens > self.max_length:
                self.max_length = num_tokens

            # if the cache_tokens flag is set to True, then store the tokens.
            if cache_tokens:
                cached_tokens.append(index_attr_tokens)

            if cache_empty_records and num_tokens == 0:
                empty_records.append(row_id)

            row_id += 1            

        return {'cached_tokens' : cached_tokens,
                'empty_records' : empty_records}
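Caching the token-set sizes (and the min/max lengths) enables a size filter at probe time: jaccard(X, Y) <= min(|X|, |Y|) / max(|X|, |Y|), so for threshold t a candidate of size s can only match a probe of size n when t * n <= s <= n / t. A standalone sketch (illustrative name):

import math

def jaccard_size_bounds(probe_size, threshold):
    # candidate sizes outside [t * n, n / t] cannot reach jaccard >= t
    lower = int(math.ceil(threshold * probe_size))
    upper = int(math.floor(probe_size / threshold))
    return lower, upper

print(jaccard_size_bounds(10, 0.8))   # (8, 12)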
Example #13
    def build_index(self):
        for row in self.table.itertuples():
            token_list = list(row[self.join_attr])
            ordered_token_list = order_using_token_ordering(
                token_list, self.token_ordering)
            num_tokens = len(ordered_token_list)
            prefix_length = int(num_tokens -
                                ceil(self.threshold * num_tokens) +
                                self.prefix_scheme)
            i = 0
            for token in ordered_token_list:
                if i == prefix_length:
                    break
                if self.prefix_index.get(token) is None:
                    self.prefix_index[token] = []
                self.prefix_index[token].append(row[self.id_attr])
                i += 1
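The prefix length computed inline above generalizes the standard scheme to n - ceil(t * n) + c, where c is self.prefix_scheme: c = 1 is the plain prefix filter, and larger c indexes deeper prefixes for stronger (adaptive) filtering. A standalone sketch under that assumption (illustrative name):

import math

def prefix_length(num_tokens, threshold, prefix_scheme=1):
    # generalized prefix: n - ceil(t * n) + c
    return int(num_tokens - math.ceil(threshold * num_tokens) + prefix_scheme)

print(prefix_length(5, 0.8))      # 2
print(prefix_length(5, 0.8, 2))   # 3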
Example #14
    def build_delta_indexes(self):
        self.delta_indexes.append({})
        self.delta_indexes.append({})
        num_delta_indexes = 1
        for idx, row in self.table.iterrows():
            id = row[self.id_attr]
            token_list = list(row[self.join_attr])
            ordered_token_list = order_using_token_ordering(
                token_list, self.token_ordering)
            num_tokens = len(ordered_token_list)
            if num_tokens < self.min_len:
                self.min_len = num_tokens
            if num_tokens > self.max_len:
                self.max_len = num_tokens
            self.size_map[id] = num_tokens
            t = ceil(self.threshold * num_tokens)
            one_prefix_length = int(num_tokens - t + 1)

            i = 0
            for i in xrange(0, one_prefix_length):
                if self.delta_indexes[1].get(ordered_token_list[i]) is None:
                    self.delta_indexes[1][ordered_token_list[i]] = []
                self.delta_indexes[1][ordered_token_list[i]].append((id, i))

            j = 2
            for i in xrange(one_prefix_length, num_tokens):
                if j > t:
                    break
                if num_delta_indexes < j:
                    self.delta_indexes.append({})
                    num_delta_indexes += 1
                if self.delta_indexes[j].get(ordered_token_list[i]) is None:
                    self.delta_indexes[j][ordered_token_list[i]] = []
                self.delta_indexes[j][ordered_token_list[i]].append((id, i))
                j += 1

        self.avg_len = (self.min_len + self.max_len) / 2
        return True
Example #15
    def build_index(self):
        if self.adaptive_prefix:
            return self.build_delta_indexes()

        for idx, row in self.table.iterrows():
            id = row[self.id_attr]
            token_list = list(row[self.join_attr])
            ordered_token_list = order_using_token_ordering(
                token_list, self.token_ordering)
            num_tokens = len(ordered_token_list)
            self.size_map[id] = num_tokens
            prefix_length = int(num_tokens -
                                ceil(self.threshold * num_tokens) +
                                self.prefix_scheme)
            i = 0
            for token in ordered_token_list:
                if i == prefix_length:
                    break
                if self.position_index.get(token) is None:
                    self.position_index[token] = []
                self.position_index[token].append((id, i))
                i += 1

        return True
Example #16
def _edit_distance_join_split(ltable_list, rtable_list,
                              l_columns, r_columns,
                              l_key_attr, r_key_attr,
                              l_join_attr, r_join_attr,
                              tokenizer, threshold, comp_op,
                              l_out_attrs, r_out_attrs,
                              l_out_prefix, r_out_prefix,
                              out_sim_score, show_progress):
    """Perform edit distance join for a split of ltable and rtable"""
    # find column indices of key attr, join attr and output attrs in ltable
    l_key_attr_index = l_columns.index(l_key_attr)
    l_join_attr_index = l_columns.index(l_join_attr)
    l_out_attrs_indices = find_output_attribute_indices(l_columns, l_out_attrs)

    # find column indices of key attr, join attr and output attrs in rtable
    r_key_attr_index = r_columns.index(r_key_attr)
    r_join_attr_index = r_columns.index(r_join_attr)
    r_out_attrs_indices = find_output_attribute_indices(r_columns, r_out_attrs)

    sim_measure_type = 'EDIT_DISTANCE'
    # generate token ordering using tokens in l_join_attr
    # and r_join_attr
    token_ordering = gen_token_ordering_for_tables(
                         [ltable_list, rtable_list],
                         [l_join_attr_index, r_join_attr_index],
                         tokenizer, sim_measure_type)

    # cache l_join_attr lengths
    l_join_attr_list = []
    for row in ltable_list:
        l_join_attr_list.append(len(row[l_join_attr_index]))

    # Build prefix index on l_join_attr
    prefix_index = PrefixIndex(ltable_list, l_join_attr_index,
                               tokenizer, sim_measure_type, threshold,
                               token_ordering)
    prefix_index.build(False)

    prefix_filter = PrefixFilter(tokenizer, sim_measure_type, threshold)

    comp_fn = COMP_OP_MAP[comp_op]
    sim_fn = get_sim_function(sim_measure_type)

    output_rows = []
    has_output_attributes = (l_out_attrs is not None or
                             r_out_attrs is not None)

    if show_progress:
        prog_bar = pyprind.ProgBar(len(rtable_list))

    for r_row in rtable_list:
        r_string = r_row[r_join_attr_index]
        r_len = len(r_string)

        r_ordered_tokens = order_using_token_ordering(
                tokenizer.tokenize(r_string), token_ordering)

        # obtain candidates by applying prefix filter. 
        candidates = prefix_filter.find_candidates(r_ordered_tokens,
                                                   prefix_index)

        for cand in candidates:
            if r_len - threshold <= l_join_attr_list[cand] <= r_len + threshold:
                l_row = ltable_list[cand]

                # compute the actual edit distance                           
                edit_dist = sim_fn(l_row[l_join_attr_index], r_string)

                if comp_fn(edit_dist, threshold):
                    if has_output_attributes:
                        output_row = get_output_row_from_tables(
                                         l_row, r_row,
                                         l_key_attr_index, r_key_attr_index,
                                         l_out_attrs_indices,
                                         r_out_attrs_indices)
                    else:
                        output_row = [l_row[l_key_attr_index],
                                      r_row[r_key_attr_index]]

                    # if out_sim_score flag is set, append the edit distance 
                    # score to the output record.
                    if out_sim_score:
                        output_row.append(edit_dist)

                    output_rows.append(output_row)

        if show_progress:
            prog_bar.update()

    output_header = get_output_header_from_tables(
                        l_key_attr, r_key_attr,
                        l_out_attrs, r_out_attrs,
                        l_out_prefix, r_out_prefix)
    if out_sim_score:
        output_header.append("_sim_score")

    # generate a dataframe from the list of output rows
    output_table = pd.DataFrame(output_rows, columns=output_header)
    return output_table
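The candidate test r_len - threshold <= l_len <= r_len + threshold is a length filter: a single edit changes a string's length by at most one, so two strings within edit distance k can differ in length by at most k. A standalone sketch of that condition (illustrative name):

def length_filter_survives(l_len, r_len, max_edit_distance):
    # strings whose lengths differ by more than k cannot be within
    # edit distance k
    return abs(l_len - r_len) <= max_edit_distance

assert length_filter_survives(len('table'), len('tables'), 1)
assert not length_filter_survives(len('ab'), len('abcdef'), 3)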
Example #17
    def filter_pair(self, lstring, rstring):
        """Checks if the input strings get dropped by the position filter.

        Args:
            lstring,rstring (string): input strings

        Returns:
            A flag indicating whether the string pair is dropped (boolean).
        """

        # If one of the inputs is missing, then check the allow_missing flag.
        # If it is set to True, then pass the pair. Else drop the pair.
        if pd.isnull(lstring) or pd.isnull(rstring):
            return (not self.allow_missing)

        # tokenize input strings     
        ltokens = self.tokenizer.tokenize(lstring)
        rtokens = self.tokenizer.tokenize(rstring)

        l_num_tokens = len(ltokens)
        r_num_tokens = len(rtokens)

        if l_num_tokens == 0 and r_num_tokens == 0:
            if self.sim_measure_type == 'OVERLAP':
                return True
            elif self.sim_measure_type == 'EDIT_DISTANCE':
                return False
            else:
                return (not self.allow_empty)

        token_ordering = gen_token_ordering_for_lists([ltokens, rtokens])
        ordered_ltokens = order_using_token_ordering(ltokens, token_ordering)
        ordered_rtokens = order_using_token_ordering(rtokens, token_ordering)

        l_prefix_length = get_prefix_length(l_num_tokens,
                                            self.sim_measure_type,
                                            self.threshold,
                                            self.tokenizer) 
        r_prefix_length = get_prefix_length(r_num_tokens,
                                            self.sim_measure_type,
                                            self.threshold,
                                            self.tokenizer)

        if l_prefix_length <= 0 or r_prefix_length <= 0:
            return True
 
        l_prefix_dict = {}
        l_pos = 0
        for token in ordered_ltokens[0:l_prefix_length]:
            l_prefix_dict[token] = l_pos
            l_pos += 1

        overlap_threshold = get_overlap_threshold(l_num_tokens, r_num_tokens,
                                                  self.sim_measure_type,
                                                  self.threshold,
                                                  self.tokenizer)
        current_overlap = 0
        r_pos = 0 
        for token in ordered_rtokens[0:r_prefix_length]:
            l_pos = l_prefix_dict.get(token)
            if l_pos is not None:
                overlap_upper_bound = 1 + min(l_num_tokens - l_pos - 1,
                                              r_num_tokens - r_pos - 1)
                if (current_overlap + overlap_upper_bound) < overlap_threshold:
                    return True
                current_overlap += 1
            r_pos += 1

        if current_overlap > 0:
            return False
        return True
Example #19
    def filter_tables(self,
                      ltable,
                      rtable,
                      l_key_attr,
                      r_key_attr,
                      l_filter_attr,
                      r_filter_attr,
                      l_out_attrs=None,
                      r_out_attrs=None,
                      l_out_prefix='l_',
                      r_out_prefix='r_'):
        """Filter tables with suffix filter.

        Args:
        ltable, rtable : Pandas data frame
        l_key_attr, r_key_attr : String, key attribute from ltable and rtable
        l_filter_attr, r_filter_attr : String, filter attribute from ltable and rtable
        l_out_attrs, r_out_attrs : list of attributes to be included in the output table from ltable and rtable
        l_out_prefix, r_out_prefix : String, prefix to be used in the attribute names of the output table 

        Returns:
        result : Pandas data frame
        """
        # check if the input tables are dataframes
        validate_input_table(ltable, 'left table')
        validate_input_table(rtable, 'right table')

        # check if the key attributes and filter attributes exist
        validate_attr(l_key_attr, ltable.columns, 'key attribute',
                      'left table')
        validate_attr(r_key_attr, rtable.columns, 'key attribute',
                      'right table')
        validate_attr(l_filter_attr, ltable.columns, 'filter attribute',
                      'left table')
        validate_attr(r_filter_attr, rtable.columns, 'filter attribute',
                      'right table')

        # check if the output attributes exist
        validate_output_attrs(l_out_attrs, ltable.columns, r_out_attrs,
                              rtable.columns)

        # check if the key attributes are unique and do not contain missing values
        validate_key_attr(l_key_attr, ltable, 'left table')
        validate_key_attr(r_key_attr, rtable, 'right table')

        # find column indices of key attr, filter attr and
        # output attrs in ltable
        l_columns = list(ltable.columns.values)
        l_key_attr_index = l_columns.index(l_key_attr)
        l_filter_attr_index = l_columns.index(l_filter_attr)
        l_out_attrs_indices = find_output_attribute_indices(
            l_columns, l_out_attrs)

        # find column indices of key attr, filter attr and
        # output attrs in rtable
        r_columns = list(rtable.columns.values)
        r_key_attr_index = r_columns.index(r_key_attr)
        r_filter_attr_index = r_columns.index(r_filter_attr)
        r_out_attrs_indices = find_output_attribute_indices(
            r_columns, r_out_attrs)

        # build a dictionary on ltable
        ltable_dict = build_dict_from_table(ltable, l_key_attr_index,
                                            l_filter_attr_index)

        # build a dictionary on rtable
        rtable_dict = build_dict_from_table(rtable, r_key_attr_index,
                                            r_filter_attr_index)

        # generate token ordering using tokens in l_filter_attr
        # and r_filter_attr
        token_ordering = gen_token_ordering_for_tables(
            [ltable_dict.values(), rtable_dict.values()],
            [l_filter_attr_index, r_filter_attr_index], self.tokenizer,
            self.sim_measure_type)

        output_rows = []
        has_output_attributes = (l_out_attrs is not None
                                 or r_out_attrs is not None)
        prog_bar = pyprind.ProgBar(len(ltable))

        for l_row in ltable_dict.values():
            l_id = l_row[l_key_attr_index]
            l_string = str(l_row[l_filter_attr_index])
            # check for empty string
            if not l_string:
                continue
            ltokens = tokenize(l_string, self.tokenizer, self.sim_measure_type)
            ordered_ltokens = order_using_token_ordering(
                ltokens, token_ordering)
            l_num_tokens = len(ordered_ltokens)
            l_prefix_length = get_prefix_length(l_num_tokens,
                                                self.sim_measure_type,
                                                self.threshold, self.tokenizer)
            l_suffix = ordered_ltokens[l_prefix_length:]
            for r_row in rtable_dict.values():
                r_id = r_row[r_key_attr_index]
                r_string = str(r_row[r_filter_attr_index])
                # check for empty string
                if not r_string:
                    continue
                rtokens = tokenize(r_string, self.tokenizer,
                                   self.sim_measure_type)
                ordered_rtokens = order_using_token_ordering(
                    rtokens, token_ordering)
                r_num_tokens = len(ordered_rtokens)
                r_prefix_length = get_prefix_length(r_num_tokens,
                                                    self.sim_measure_type,
                                                    self.threshold,
                                                    self.tokenizer)
                if not self._filter_suffix(
                        l_suffix, ordered_rtokens[r_prefix_length:],
                        l_prefix_length, r_prefix_length, l_num_tokens,
                        r_num_tokens):
                    if has_output_attributes:
                        output_row = get_output_row_from_tables(
                            ltable_dict[l_id], r_row, l_id, r_id,
                            l_out_attrs_indices, r_out_attrs_indices)
                        output_rows.append(output_row)
                    else:
                        output_rows.append([l_id, r_id])

            prog_bar.update()

        output_header = get_output_header_from_tables(l_key_attr, r_key_attr,
                                                      l_out_attrs, r_out_attrs,
                                                      l_out_prefix,
                                                      r_out_prefix)

        # generate a dataframe from the list of output rows
        output_table = pd.DataFrame(output_rows, columns=output_header)
        output_table.insert(0, '_id', range(0, len(output_table)))
        return output_table
Example #20
def jaccard_join(ltable,
                 rtable,
                 l_id_attr,
                 l_join_attr,
                 r_id_attr,
                 r_join_attr,
                 threshold,
                 ltable_output_attrs=None,
                 rtable_output_attrs=None,
                 filters=[]):

    if len(filters) == 0:
        return jaccard_join_auto(ltable, rtable, l_id_attr, l_join_attr,
                                 r_id_attr, r_join_attr, threshold,
                                 ltable_output_attrs, rtable_output_attrs)

    matches_list = []
    sim_function = get_jaccard_fn()

    (index_filters, non_index_filters, token_ordering,
     need_ordering) = analyze_filters(filters)

    l_join_attr_token_dict = {}
    prog_bar = pyprind.ProgBar(len(rtable.index))

    l_row_dict = {}
    for idx, l_row in ltable.iterrows():
        l_id = l_row[l_id_attr]
        l_row_dict[l_id] = l_row
        l_join_attr_token_dict[l_id] = order_using_token_ordering(
            list(l_row[l_join_attr]), token_ordering)

    r_row_dict = {}
    for idx, r_row in rtable.iterrows():
        r_id = r_row[r_id_attr]
        r_row_dict[r_id] = r_row

    for r_id in r_row_dict.keys():
        r_row = r_row_dict[r_id]
        r_tokens = order_using_token_ordering(list(r_row[r_join_attr]),
                                              token_ordering)
        r_num_tokens = len(r_tokens)

        l_cand_ids = apply_index_filters(r_tokens, r_num_tokens, threshold,
                                         index_filters)
        for l_id in l_cand_ids:

            l_row = l_row_dict[l_id]
            l_tokens = l_join_attr_token_dict[l_id]
            if apply_non_index_filters(l_tokens, r_tokens, len(l_tokens),
                                       r_num_tokens, threshold,
                                       non_index_filters):
                if sim_function(l_row[l_join_attr],
                                r_row[r_join_attr]) >= threshold:
                    match_dict = get_output_attributes(l_row, r_row, l_id_attr,
                                                       l_id, r_id_attr, r_id,
                                                       ltable_output_attrs,
                                                       rtable_output_attrs)
                    matches_list.append(match_dict)
        prog_bar.update()

    output_matches = pd.DataFrame(matches_list)
    return output_matches
Example #21
def _filter_tables_split(ltable, rtable,
                         l_columns, r_columns,
                         l_key_attr, r_key_attr,
                         l_filter_attr, r_filter_attr,
                         suffix_filter,
                         l_out_attrs, r_out_attrs,
                         l_out_prefix, r_out_prefix, show_progress):
    # find column indices of key attr, filter attr and output attrs in ltable
    l_key_attr_index = l_columns.index(l_key_attr)
    l_filter_attr_index = l_columns.index(l_filter_attr)
    l_out_attrs_indices = find_output_attribute_indices(l_columns, l_out_attrs)

    # find column indices of key attr, filter attr and output attrs in rtable
    r_key_attr_index = r_columns.index(r_key_attr)
    r_filter_attr_index = r_columns.index(r_filter_attr)
    r_out_attrs_indices = find_output_attribute_indices(r_columns, r_out_attrs)
        
    # generate token ordering using tokens in l_filter_attr and r_filter_attr
    token_ordering = gen_token_ordering_for_tables(
                                [ltable, rtable],
                                [l_filter_attr_index, r_filter_attr_index],
                                suffix_filter.tokenizer,
                                suffix_filter.sim_measure_type)

    # ignore allow_empty flag for OVERLAP and EDIT_DISTANCE measures.           
    handle_empty = (suffix_filter.allow_empty and
        suffix_filter.sim_measure_type not in ['OVERLAP', 'EDIT_DISTANCE'])

    output_rows = []
    has_output_attributes = (l_out_attrs is not None or
                             r_out_attrs is not None)

    if show_progress:
        prog_bar = pyprind.ProgBar(len(ltable))

    for l_row in ltable:
        l_string = l_row[l_filter_attr_index]

        ltokens = suffix_filter.tokenizer.tokenize(l_string)
        ordered_ltokens = order_using_token_ordering(ltokens, token_ordering)
        l_num_tokens = len(ordered_ltokens)
        l_prefix_length = get_prefix_length(l_num_tokens,
                                            suffix_filter.sim_measure_type,
                                            suffix_filter.threshold,
                                            suffix_filter.tokenizer)

        l_suffix = ordered_ltokens[l_prefix_length:]
        for r_row in rtable:
            r_string = r_row[r_filter_attr_index]

            rtokens = suffix_filter.tokenizer.tokenize(r_string)
            ordered_rtokens = order_using_token_ordering(rtokens,
                                                         token_ordering)
            r_num_tokens = len(ordered_rtokens)

            # If allow_empty flag is set, then add the pair to the output.
            if handle_empty and l_num_tokens == 0 and r_num_tokens == 0:
                if has_output_attributes:
                    output_row = get_output_row_from_tables(
                                         l_row, r_row,
                                         l_key_attr_index, r_key_attr_index,
                                         l_out_attrs_indices,
                                         r_out_attrs_indices)
                else:
                    output_row = [l_row[l_key_attr_index],
                                  r_row[r_key_attr_index]]

                output_rows.append(output_row)
                continue

            r_prefix_length = get_prefix_length(r_num_tokens,
                                                suffix_filter.sim_measure_type,
                                                suffix_filter.threshold,
                                                suffix_filter.tokenizer)

            if l_prefix_length <= 0 or r_prefix_length <= 0:
                continue

            if not suffix_filter._filter_suffix(l_suffix,
                                           ordered_rtokens[r_prefix_length:],
                                           l_prefix_length, r_prefix_length,
                                           l_num_tokens, r_num_tokens):
                if has_output_attributes:
                    output_row = get_output_row_from_tables(
                                         l_row, r_row,
                                         l_key_attr_index, r_key_attr_index, 
                                         l_out_attrs_indices,
                                         r_out_attrs_indices)
                else:
                    output_row = [l_row[l_key_attr_index],
                                  r_row[r_key_attr_index]]

                output_rows.append(output_row)

        if show_progress:
            prog_bar.update()

    output_header = get_output_header_from_tables(
                            l_key_attr, r_key_attr,
                            l_out_attrs, r_out_attrs, 
                            l_out_prefix, r_out_prefix)

    # generate a dataframe from the list of output rows
    output_table = pd.DataFrame(output_rows, columns=output_header)
    return output_table
Example #22
def _edit_dist_join_split(ltable, rtable,
                          l_key_attr, r_key_attr,
                          l_join_attr, r_join_attr,
                          tokenizer,
                          threshold,
                          l_out_attrs, r_out_attrs,
                          l_out_prefix, r_out_prefix,
                          out_sim_score):
    # find column indices of key attr, join attr and output attrs in ltable
    l_columns = list(ltable.columns.values)
    l_key_attr_index = l_columns.index(l_key_attr)
    l_join_attr_index = l_columns.index(l_join_attr)
    l_out_attrs_indices = find_output_attribute_indices(l_columns, l_out_attrs)

    # find column indices of key attr, join attr and output attrs in rtable
    r_columns = list(rtable.columns.values)
    r_key_attr_index = r_columns.index(r_key_attr)
    r_join_attr_index = r_columns.index(r_join_attr)
    r_out_attrs_indices = find_output_attribute_indices(r_columns, r_out_attrs)

    # build a dictionary on ltable
    ltable_dict = build_dict_from_table(ltable, l_key_attr_index,
                                        l_join_attr_index)

    # build a dictionary on rtable
    rtable_dict = build_dict_from_table(rtable, r_key_attr_index,
                                        r_join_attr_index)

    sim_measure_type = 'EDIT_DISTANCE'
    # generate token ordering using tokens in l_join_attr
    # and r_join_attr
    token_ordering = gen_token_ordering_for_tables(
                         [ltable_dict.values(),
                          rtable_dict.values()],
                         [l_join_attr_index,
                          r_join_attr_index],
                         tokenizer, sim_measure_type)

    # build a dictionary of l_join_attr lengths
    l_join_attr_dict = {}
    for row in ltable_dict.values():
        l_join_attr_dict[row[l_key_attr_index]] = len(str(
                                                      row[l_join_attr_index]))

    # Build prefix index on l_join_attr
    prefix_index = PrefixIndex(ltable_dict.values(),
                               l_key_attr_index, l_join_attr_index,
                               tokenizer, sim_measure_type, threshold,
                               token_ordering)
    prefix_index.build()

    prefix_filter = PrefixFilter(tokenizer, sim_measure_type, threshold)
    sim_fn = get_sim_function(sim_measure_type)
    output_rows = []
    has_output_attributes = (l_out_attrs is not None or
                             r_out_attrs is not None)
    prog_bar = pyprind.ProgBar(len(rtable_dict.keys()))

    for r_row in rtable_dict.values():
        r_id = r_row[r_key_attr_index]
        r_string = str(r_row[r_join_attr_index])
        r_len = len(r_string)
        # check for empty string
        if not r_string:
            continue
        r_join_attr_tokens = tokenize(r_string, tokenizer, sim_measure_type)
        r_ordered_tokens = order_using_token_ordering(r_join_attr_tokens,
                                                      token_ordering)
        candidates = find_candidates_prefix_filter(
                         r_ordered_tokens, len(r_ordered_tokens),
                         prefix_filter, prefix_index) 
        for cand in candidates:
            if r_len - threshold <= l_join_attr_dict[cand] <= r_len + threshold:
                edit_dist = sim_fn(str(ltable_dict[cand][l_join_attr_index]),
                                   r_string)
                if edit_dist <= threshold:
                    if has_output_attributes:
                        output_row = get_output_row_from_tables(
                                         ltable_dict[cand], r_row,
                                         cand, r_id,
                                         l_out_attrs_indices,
                                         r_out_attrs_indices)
                        if out_sim_score:
                            output_row.append(edit_dist)
                        output_rows.append(output_row)
                    else:
                        output_row = [cand, r_id]
                        if out_sim_score:
                            output_row.append(edit_dist)
                        output_rows.append(output_row)

        prog_bar.update()

    output_header = get_output_header_from_tables(
                        l_key_attr, r_key_attr,
                        l_out_attrs, r_out_attrs,
                        l_out_prefix, r_out_prefix)
    if out_sim_score:
        output_header.append("_sim_score")

    # generate a dataframe from the list of output rows
    output_table = pd.DataFrame(output_rows, columns=output_header)
    return output_table
Example #23
def _filter_tables_split(ltable, rtable,
                         l_key_attr, r_key_attr,
                         l_filter_attr, r_filter_attr,
                         prefix_filter,
                         l_out_attrs, r_out_attrs,
                         l_out_prefix, r_out_prefix):
    # find column indices of key attr, filter attr and output attrs in ltable
    l_columns = list(ltable.columns.values)
    l_key_attr_index = l_columns.index(l_key_attr)
    l_filter_attr_index = l_columns.index(l_filter_attr)
    l_out_attrs_indices = find_output_attribute_indices(l_columns, l_out_attrs)

    # find column indices of key attr, filter attr and output attrs in rtable
    r_columns = list(rtable.columns.values)
    r_key_attr_index = r_columns.index(r_key_attr)
    r_filter_attr_index = r_columns.index(r_filter_attr)
    r_out_attrs_indices = find_output_attribute_indices(r_columns, r_out_attrs)

    # build a dictionary on ltable
    ltable_dict = build_dict_from_table(ltable, l_key_attr_index,
                                        l_filter_attr_index)

    # build a dictionary on rtable
    rtable_dict = build_dict_from_table(rtable, r_key_attr_index,
                                        r_filter_attr_index)

    # generate token ordering using tokens in l_filter_attr and r_filter_attr
    token_ordering = gen_token_ordering_for_tables(
                                 [ltable_dict.values(), rtable_dict.values()],
                                 [l_filter_attr_index, r_filter_attr_index],
                                 prefix_filter.tokenizer,
                                 prefix_filter.sim_measure_type)

    # Build prefix index on l_filter_attr
    prefix_index = PrefixIndex(ltable_dict.values(), l_key_attr_index,
                               l_filter_attr_index, prefix_filter.tokenizer,
                               prefix_filter.sim_measure_type,
                               prefix_filter.threshold, token_ordering)
    prefix_index.build()

    output_rows = []
    has_output_attributes = (l_out_attrs is not None or
                             r_out_attrs is not None)
    prog_bar = pyprind.ProgBar(len(rtable))

    for r_row in rtable_dict.values():
        r_id = r_row[r_key_attr_index]
        r_string = str(r_row[r_filter_attr_index])
        # check for empty string
        if not r_string:
            continue
        r_filter_attr_tokens = tokenize(r_string,
                                        prefix_filter.tokenizer,
                                        prefix_filter.sim_measure_type)
        r_ordered_tokens = order_using_token_ordering(r_filter_attr_tokens,
                                                      token_ordering)
           
        # probe prefix index and find candidates
        candidates = _find_candidates(r_ordered_tokens, len(r_ordered_tokens),
                                      prefix_filter, prefix_index)

        for cand in candidates:
            if has_output_attributes:
                output_row = get_output_row_from_tables(
                                     ltable_dict[cand], r_row,
                                     cand, r_id, 
                                     l_out_attrs_indices, r_out_attrs_indices)
                output_rows.append(output_row)
            else:
                output_rows.append([cand, r_id])
 
        prog_bar.update()

    output_header = get_output_header_from_tables(l_key_attr, r_key_attr,
                                                  l_out_attrs, r_out_attrs, 
                                                  l_out_prefix, r_out_prefix)

    # generate a dataframe from the list of output rows
    output_table = pd.DataFrame(output_rows, columns=output_header)
    return output_table
Example #24
def set_sim_join(ltable, rtable, l_columns, r_columns, l_key_attr, r_key_attr,
                 l_join_attr, r_join_attr, tokenizer, sim_measure_type,
                 threshold, comp_op, allow_empty, l_out_attrs, r_out_attrs,
                 l_out_prefix, r_out_prefix, out_sim_score, show_progress):
    """Perform set similarity join for a split of ltable and rtable"""

    # find column indices of key attr, join attr and output attrs in ltable
    l_key_attr_index = l_columns.index(l_key_attr)
    l_join_attr_index = l_columns.index(l_join_attr)
    l_out_attrs_indices = find_output_attribute_indices(l_columns, l_out_attrs)

    # find column indices of key attr, join attr and output attrs in rtable
    r_key_attr_index = r_columns.index(r_key_attr)
    r_join_attr_index = r_columns.index(r_join_attr)
    r_out_attrs_indices = find_output_attribute_indices(r_columns, r_out_attrs)

    # generate token ordering using tokens in l_join_attr
    # and r_join_attr
    token_ordering = gen_token_ordering_for_tables(
        [ltable, rtable], [l_join_attr_index, r_join_attr_index], tokenizer,
        sim_measure_type)

    # Build position index on l_join_attr
    position_index = PositionIndex(ltable, l_join_attr_index, tokenizer,
                                   sim_measure_type, threshold, token_ordering)
    # While building the index, we cache the tokens and the empty records.
    # We cache the tokens so that we need not tokenize each string in
    # l_join_attr multiple times when we need to compute the similarity measure.
    # Further we cache the empty record ids to handle the allow_empty flag.
    cached_data = position_index.build(allow_empty, cache_tokens=True)
    l_empty_records = cached_data['empty_records']
    cached_l_tokens = cached_data['cached_tokens']

    pos_filter = PositionFilter(tokenizer, sim_measure_type, threshold)

    sim_fn = get_sim_function(sim_measure_type)
    comp_fn = COMP_OP_MAP[comp_op]

    output_rows = []
    has_output_attributes = (l_out_attrs is not None
                             or r_out_attrs is not None)

    if show_progress:
        prog_bar = pyprind.ProgBar(len(rtable))
    for r_row in rtable:
        r_string = r_row[r_join_attr_index]

        # order the tokens using the token ordering.
        r_ordered_tokens = order_using_token_ordering(
            tokenizer.tokenize(r_string), token_ordering)

        # If allow_empty flag is set and the current rtable record has empty set
        # of tokens in the join attribute, then generate output pairs joining
        # the current rtable record with those records in ltable with empty set
        # of tokens in the join attribute. These ltable record ids are cached in
        # l_empty_records list which was constructed when building the position
        # index.
        if allow_empty and len(r_ordered_tokens) == 0:
            for l_id in l_empty_records:
                if has_output_attributes:
                    output_row = get_output_row_from_tables(
                        ltable[l_id], r_row, l_key_attr_index,
                        r_key_attr_index, l_out_attrs_indices,
                        r_out_attrs_indices)
                else:
                    output_row = [
                        ltable[l_id][l_key_attr_index], r_row[r_key_attr_index]
                    ]

                if out_sim_score:
                    output_row.append(1.0)
                output_rows.append(output_row)
            continue

        # obtain candidates by applying position filter.
        candidate_overlap = pos_filter.find_candidates(r_ordered_tokens,
                                                       position_index)

        for cand, overlap in iteritems(candidate_overlap):
            if overlap > 0:
                l_ordered_tokens = cached_l_tokens[cand]
                # compute the actual similarity score
                sim_score = sim_fn(l_ordered_tokens, r_ordered_tokens)

                if comp_fn(sim_score, threshold):
                    if has_output_attributes:
                        output_row = get_output_row_from_tables(
                            ltable[cand], r_row, l_key_attr_index,
                            r_key_attr_index, l_out_attrs_indices,
                            r_out_attrs_indices)
                    else:
                        output_row = [
                            ltable[cand][l_key_attr_index],
                            r_row[r_key_attr_index]
                        ]

                    # if out_sim_score flag is set, append the similarity score
                    # to the output record.
                    if out_sim_score:
                        output_row.append(sim_score)

                    output_rows.append(output_row)

        if show_progress:
            prog_bar.update()
    output_header = get_output_header_from_tables(l_key_attr, r_key_attr,
                                                  l_out_attrs, r_out_attrs,
                                                  l_out_prefix, r_out_prefix)
    if out_sim_score:
        output_header.append("_sim_score")

    # generate a dataframe from the list of output rows
    output_table = pd.DataFrame(output_rows, columns=output_header)
    return output_table
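
For context on the two lookups used above, get_sim_function and COMP_OP_MAP plausibly resolve to plain callables like the following sketch; the jaccard helper and the map literal here are illustrations under that assumption, not the library's actual definitions.

import operator

def jaccard(ltokens, rtokens):
    # Jaccard similarity over token sets: |L & R| / |L | R|.
    lset, rset = set(ltokens), set(rtokens)
    if not lset and not rset:
        return 0.0
    return len(lset & rset) / float(len(lset | rset))

# comp_fn(sim_score, threshold) then reduces to a plain comparison,
# e.g. sim_score >= threshold when comp_op is '>='.
COMP_OP_MAP = {'>=': operator.ge, '>': operator.gt,
               '<=': operator.le, '<': operator.lt,
               '=': operator.eq, '!=': operator.ne}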
Example #25
def set_sim_join(ltable, rtable,
                 l_columns, r_columns,
                 l_key_attr, r_key_attr,
                 l_join_attr, r_join_attr,
                 tokenizer, sim_measure_type, threshold, comp_op,
                 allow_empty,
                 l_out_attrs, r_out_attrs,
                 l_out_prefix, r_out_prefix,
                 out_sim_score, show_progress):
    """Perform set similarity join for a split of ltable and rtable"""

    # find column indices of key attr, join attr and output attrs in ltable
    l_key_attr_index = l_columns.index(l_key_attr)
    l_join_attr_index = l_columns.index(l_join_attr)
    l_out_attrs_indices = find_output_attribute_indices(l_columns, l_out_attrs)

    # find column indices of key attr, join attr and output attrs in rtable
    r_key_attr_index = r_columns.index(r_key_attr)
    r_join_attr_index = r_columns.index(r_join_attr)
    r_out_attrs_indices = find_output_attribute_indices(r_columns, r_out_attrs)

    # generate token ordering using tokens in l_join_attr
    # and r_join_attr
    token_ordering = gen_token_ordering_for_tables(
                         [ltable, rtable],
                         [l_join_attr_index, r_join_attr_index],
                         tokenizer, sim_measure_type)

    # Build position index on l_join_attr
    position_index = PositionIndex(ltable, l_join_attr_index,
                                   tokenizer, sim_measure_type,
                                   threshold, token_ordering)
    # While building the index, we cache the tokens and the empty records.
    # We cache the tokens so that we need not tokenize each string in 
    # l_join_attr multiple times when we need to compute the similarity measure.
    # Further we cache the empty record ids to handle the allow_empty flag.
    cached_data = position_index.build(allow_empty, cache_tokens=True)
    l_empty_records = cached_data['empty_records']
    cached_l_tokens = cached_data['cached_tokens']

    pos_filter = PositionFilter(tokenizer, sim_measure_type, threshold)

    sim_fn = get_sim_function(sim_measure_type)
    comp_fn = COMP_OP_MAP[comp_op]

    output_rows = []
    has_output_attributes = (l_out_attrs is not None or
                             r_out_attrs is not None)

    if show_progress:
        prog_bar = pyprind.ProgBar(len(rtable))
    for r_row in rtable:
        r_string = r_row[r_join_attr_index]

        # order the tokens using the token ordering.
        r_ordered_tokens = order_using_token_ordering(
                tokenizer.tokenize(r_string), token_ordering)

        # If allow_empty flag is set and the current rtable record has empty set
        # of tokens in the join attribute, then generate output pairs joining 
        # the current rtable record with those records in ltable with empty set 
        # of tokens in the join attribute. These ltable record ids are cached in
        # l_empty_records list which was constructed when building the position
        # index.
        if allow_empty and len(r_ordered_tokens) == 0:
            for l_id in l_empty_records:
                if has_output_attributes:
                    output_row = get_output_row_from_tables(
                                     ltable[l_id], r_row,
                                     l_key_attr_index, r_key_attr_index,
                                     l_out_attrs_indices,
                                     r_out_attrs_indices)
                else:
                    output_row = [ltable[l_id][l_key_attr_index],
                                  r_row[r_key_attr_index]]

                if out_sim_score:
                    output_row.append(1.0)
                output_rows.append(output_row)
            continue

        # obtain candidates by applying position filter.            
        candidate_overlap = pos_filter.find_candidates(r_ordered_tokens,
                                                       position_index)

        for cand, overlap in iteritems(candidate_overlap):
            if overlap > 0:
                l_ordered_tokens = cached_l_tokens[cand]
                # compute the actual similarity score
                sim_score = sim_fn(l_ordered_tokens, r_ordered_tokens)

                if comp_fn(sim_score, threshold):
                    if has_output_attributes:
                        output_row = get_output_row_from_tables(
                                         ltable[cand], r_row,
                                         l_key_attr_index, r_key_attr_index,
                                         l_out_attrs_indices,
                                         r_out_attrs_indices)
                    else:
                        output_row = [ltable[cand][l_key_attr_index],
                                      r_row[r_key_attr_index]]

                    # if out_sim_score flag is set, append the similarity score    
                    # to the output record.  
                    if out_sim_score:
                        output_row.append(sim_score)

                    output_rows.append(output_row)

        if show_progress:
            prog_bar.update()
    output_header = get_output_header_from_tables(
                        l_key_attr, r_key_attr,
                        l_out_attrs, r_out_attrs,
                        l_out_prefix, r_out_prefix)
    if out_sim_score:
        output_header.append("_sim_score")

    # generate a dataframe from the list of output rows
    output_table = pd.DataFrame(output_rows, columns=output_header)
    return output_table
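
Assuming the helper functions this split-level join depends on are importable, invoking it could look like the sketch below. The sample tables are made up, and the WhitespaceTokenizer comes from py_stringmatching (constructed with return_set=True so tokenization yields token sets).

from py_stringmatching import WhitespaceTokenizer

l_columns = ['id', 'name']
r_columns = ['id', 'name']
ltable = [[1, 'data integration'], [2, 'string matching']]
rtable = [[10, 'data integration systems'], [11, 'entity matching']]

# join on 'name' with Jaccard >= 0.3, no extra output attributes,
# similarity scores included, progress bar disabled
pairs = set_sim_join(ltable, rtable, l_columns, r_columns,
                     'id', 'id', 'name', 'name',
                     WhitespaceTokenizer(return_set=True),
                     'JACCARD', 0.3, '>=', False,
                     None, None, 'l_', 'r_', True, False)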
Example #26
def _set_sim_join_split(ltable, rtable,
                        l_key_attr, r_key_attr,
                        l_join_attr, r_join_attr,
                        tokenizer,
                        sim_measure_type,
                        threshold,
                        l_out_attrs, r_out_attrs,
                        l_out_prefix, r_out_prefix,
                        out_sim_score):
    """Perform set similarity join for a split of ltable and rtable"""

    # find column indices of key attr, join attr and output attrs in ltable
    l_columns = list(ltable.columns.values)
    l_key_attr_index = l_columns.index(l_key_attr)
    l_join_attr_index = l_columns.index(l_join_attr)
    l_out_attrs_indices = find_output_attribute_indices(l_columns, l_out_attrs)

    # find column indices of key attr, join attr and output attrs in rtable
    r_columns = list(rtable.columns.values)
    r_key_attr_index = r_columns.index(r_key_attr)
    r_join_attr_index = r_columns.index(r_join_attr)
    r_out_attrs_indices = find_output_attribute_indices(r_columns, r_out_attrs)

    # build a dictionary on ltable
    ltable_dict = build_dict_from_table(ltable, l_key_attr_index,
                                        l_join_attr_index)

    # build a dictionary on rtable
    rtable_dict = build_dict_from_table(rtable, r_key_attr_index,
                                        r_join_attr_index)

    # generate token ordering using tokens in l_join_attr
    # and r_join_attr
    token_ordering = gen_token_ordering_for_tables(
                         [ltable_dict.values(),
                          rtable_dict.values()],
                         [l_join_attr_index,
                          r_join_attr_index],
                         tokenizer, sim_measure_type)

    # build a dictionary of tokenized l_join_attr
    l_join_attr_dict = {}
    for row in ltable_dict.values():
        l_join_attr_dict[row[l_key_attr_index]] = order_using_token_ordering(
            tokenize(str(row[l_join_attr_index]), tokenizer, sim_measure_type),
            token_ordering)

    # Build position index on l_join_attr
    position_index = PositionIndex(ltable_dict.values(),
                                   l_key_attr_index, l_join_attr_index,
                                   tokenizer, sim_measure_type,
                                   threshold, token_ordering)
    position_index.build()

    pos_filter = PositionFilter(tokenizer, sim_measure_type, threshold)
    suffix_filter = SuffixFilter(tokenizer, sim_measure_type, threshold)
    sim_fn = get_sim_function(sim_measure_type)
    output_rows = []
    has_output_attributes = (l_out_attrs is not None or
                             r_out_attrs is not None)
    prog_bar = pyprind.ProgBar(len(rtable_dict))

    for r_row in rtable_dict.values():
        r_id = r_row[r_key_attr_index]
        r_string = str(r_row[r_join_attr_index])
        # check for empty string
        if not r_string:
            continue
        r_join_attr_tokens = tokenize(r_string, tokenizer, sim_measure_type)
        r_ordered_tokens = order_using_token_ordering(r_join_attr_tokens,
                                                      token_ordering)
        r_num_tokens = len(r_ordered_tokens)
        r_prefix_length = get_prefix_length(r_num_tokens,
                                            sim_measure_type,
                                            threshold, tokenizer)     

        candidate_overlap = find_candidates_position_filter(
                                r_ordered_tokens, r_num_tokens, r_prefix_length,
                                pos_filter, position_index)
        for cand, overlap in iteritems(candidate_overlap):
            if overlap > 0:
                l_ordered_tokens = l_join_attr_dict[cand]
                l_num_tokens = position_index.get_size(cand)
                l_prefix_length = get_prefix_length(
                                      l_num_tokens,
                                      sim_measure_type,
                                      threshold, tokenizer)
                if not suffix_filter._filter_suffix(
                           l_ordered_tokens[l_prefix_length:],
                           r_ordered_tokens[r_prefix_length:],
                           l_prefix_length,
                           r_prefix_length,
                           l_num_tokens, r_num_tokens):
                    sim_score = sim_fn(l_ordered_tokens, r_ordered_tokens)
                    if sim_score >= threshold:
                        if has_output_attributes:
                            output_row = get_output_row_from_tables(
                                             ltable_dict[cand], r_row,
                                             cand, r_id,
                                             l_out_attrs_indices,
                                             r_out_attrs_indices)
                        else:
                            output_row = [cand, r_id]

                        if out_sim_score:
                            output_row.append(sim_score)
                        output_rows.append(output_row)
        prog_bar.update()

    output_header = get_output_header_from_tables(
                        l_key_attr, r_key_attr,
                        l_out_attrs, r_out_attrs,
                        l_out_prefix, r_out_prefix)
    if out_sim_score:
        output_header.append("_sim_score")

    # generate a dataframe from the list of output rows
    output_table = pd.DataFrame(output_rows, columns=output_header)
    return output_table
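
The position filter applied above prunes a candidate once the required token overlap can no longer be reached. For the JACCARD measure that requirement has a closed form, sketched below with an illustrative helper name: jaccard(L, R) >= t implies |L & R| >= ceil(t / (1 + t) * (|L| + |R|)).

import math

def jaccard_overlap_threshold(l_num_tokens, r_num_tokens, threshold):
    # jaccard(L, R) >= t  <=>  |L & R| >= t * |L | R|, and since
    # |L | R| = |L| + |R| - |L & R|, this rearranges to
    # |L & R| >= (t / (1 + t)) * (|L| + |R|).
    return int(math.ceil((threshold / (1 + threshold)) *
                         (l_num_tokens + r_num_tokens)))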
Example #27
def _filter_tables_split(ltable, rtable,
                         l_columns, r_columns,
                         l_key_attr, r_key_attr,
                         l_filter_attr, r_filter_attr,
                         position_filter,
                         l_out_attrs, r_out_attrs,
                         l_out_prefix, r_out_prefix, show_progress):
    # find column indices of key attr, filter attr and output attrs in ltable
    l_key_attr_index = l_columns.index(l_key_attr)
    l_filter_attr_index = l_columns.index(l_filter_attr)
    l_out_attrs_indices = find_output_attribute_indices(l_columns, l_out_attrs)

    # find column indices of key attr, filter attr and output attrs in rtable
    r_key_attr_index = r_columns.index(r_key_attr)
    r_filter_attr_index = r_columns.index(r_filter_attr)
    r_out_attrs_indices = find_output_attribute_indices(r_columns, r_out_attrs)

    # generate token ordering using tokens in l_filter_attr and r_filter_attr
    token_ordering = gen_token_ordering_for_tables(
                                 [ltable, rtable],
                                 [l_filter_attr_index, r_filter_attr_index],
                                 position_filter.tokenizer,
                                 position_filter.sim_measure_type)

    # ignore allow_empty flag for OVERLAP and EDIT_DISTANCE measures.           
    handle_empty = (position_filter.allow_empty and
        position_filter.sim_measure_type not in ['OVERLAP', 'EDIT_DISTANCE'])

    # Build position index on l_filter_attr
    position_index = PositionIndex(ltable, l_filter_attr_index,
                                   position_filter.tokenizer,
                                   position_filter.sim_measure_type,
                                   position_filter.threshold, token_ordering)
    # While building the index, we cache the record ids with empty set of 
    # tokens. This is needed to handle the allow_empty flag.
    cached_data = position_index.build(handle_empty)
    l_empty_records = cached_data['empty_records']

    output_rows = []
    has_output_attributes = (l_out_attrs is not None or
                             r_out_attrs is not None)

    if show_progress:
        prog_bar = pyprind.ProgBar(len(rtable))

    for r_row in rtable:
        r_string = r_row[r_filter_attr_index]

        r_filter_attr_tokens = position_filter.tokenizer.tokenize(r_string)
        r_ordered_tokens = order_using_token_ordering(r_filter_attr_tokens,
                                                      token_ordering)

        # If allow_empty flag is set and the current rtable record has empty set
        # of tokens in the filter attribute, then generate output pairs joining   
        # the current rtable record with those records in ltable with empty set 
        # of tokens in the filter attribute. These ltable record ids are cached 
        # in l_empty_records list which was constructed when building the 
        # position index. 
        if handle_empty and len(r_ordered_tokens) == 0:
            for l_id in l_empty_records:
                if has_output_attributes:
                    output_row = get_output_row_from_tables(
                                     ltable[l_id], r_row,
                                     l_key_attr_index, r_key_attr_index,
                                     l_out_attrs_indices,
                                     r_out_attrs_indices)
                else:
                    output_row = [ltable[l_id][l_key_attr_index],
                                  r_row[r_key_attr_index]]

                output_rows.append(output_row)
            continue

        candidate_overlap = position_filter.find_candidates(
                                r_ordered_tokens, position_index)

        for cand, overlap in iteritems(candidate_overlap):
            if overlap > 0:
                if has_output_attributes:
                    output_row = get_output_row_from_tables(
                                     ltable[cand], r_row,
                                     l_key_attr_index, r_key_attr_index, 
                                     l_out_attrs_indices, r_out_attrs_indices)
                else:
                    output_row = [ltable[cand][l_key_attr_index],
                                  r_row[r_key_attr_index]]

                output_rows.append(output_row)

        if show_progress:                    
            prog_bar.update()

    output_header = get_output_header_from_tables(l_key_attr, r_key_attr,
                                                  l_out_attrs, r_out_attrs, 
                                                  l_out_prefix, r_out_prefix)

    # generate a dataframe from the list of output rows
    output_table = pd.DataFrame(output_rows, columns=output_header)
    return output_table
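
The l_empty_records list consumed above is cached while building the position index. A minimal sketch of that caching step, using hypothetical names and assuming rows are indexable by position:

def collect_empty_records(table, filter_attr_index, tokenizer):
    # Collect ids of records whose filter attribute tokenizes to an
    # empty set; these pair with every empty rtable record when the
    # allow_empty flag is honored.
    empty_records = []
    for row_id, row in enumerate(table):
        tokens = tokenizer.tokenize(str(row[filter_attr_index]))
        if len(tokens) == 0:
            empty_records.append(row_id)
    return empty_records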