Example #1
0
def _edit_distance_join_split(ltable_list, rtable_list, l_columns, r_columns,
                              l_key_attr, r_key_attr, l_join_attr, r_join_attr,
                              tokenizer, threshold, comp_op, l_out_attrs,
                              r_out_attrs, l_out_prefix, r_out_prefix,
                              out_sim_score, show_progress):
    """Compute the edit distance join over one partition of the input tables."""
    # resolve ltable column positions for the key, join and output attributes
    l_key_idx = l_columns.index(l_key_attr)
    l_join_idx = l_columns.index(l_join_attr)
    l_out_indices = find_output_attribute_indices(l_columns, l_out_attrs)

    # resolve rtable column positions for the key, join and output attributes
    r_key_idx = r_columns.index(r_key_attr)
    r_join_idx = r_columns.index(r_join_attr)
    r_out_indices = find_output_attribute_indices(r_columns, r_out_attrs)

    sim_measure_type = 'EDIT_DISTANCE'
    # global token ordering computed from the join attributes of both tables
    token_ordering = gen_token_ordering_for_tables(
        [ltable_list, rtable_list], [l_join_idx, r_join_idx],
        tokenizer, sim_measure_type)

    # pre-compute the join-attribute string length of every ltable row
    l_join_attr_lens = [len(row[l_join_idx]) for row in ltable_list]

    # prefix index over the ltable join attribute
    prefix_index = PrefixIndex(ltable_list, l_join_idx, tokenizer,
                               sim_measure_type, threshold, token_ordering)
    prefix_index.build(False)

    prefix_filter = PrefixFilter(tokenizer, sim_measure_type, threshold)

    comp_fn = COMP_OP_MAP[comp_op]
    sim_fn = get_sim_function(sim_measure_type)

    output_rows = []
    has_output_attributes = l_out_attrs is not None or r_out_attrs is not None

    prog_bar = pyprind.ProgBar(len(rtable_list)) if show_progress else None

    for r_row in rtable_list:
        r_string = r_row[r_join_idx]
        r_len = len(r_string)

        r_ordered_tokens = order_using_token_ordering(
            tokenizer.tokenize(r_string), token_ordering)

        # prefix filter prunes ltable rows that cannot satisfy the threshold
        for cand in prefix_filter.find_candidates(r_ordered_tokens,
                                                  prefix_index):
            # length filter: strings whose lengths differ by more than the
            # threshold cannot be within the edit distance threshold
            if abs(l_join_attr_lens[cand] - r_len) > threshold:
                continue
            l_row = ltable_list[cand]

            # verify the surviving candidate with the exact edit distance
            edit_dist = sim_fn(l_row[l_join_idx], r_string)
            if not comp_fn(edit_dist, threshold):
                continue

            if has_output_attributes:
                output_row = get_output_row_from_tables(
                    l_row, r_row, l_key_idx, r_key_idx,
                    l_out_indices, r_out_indices)
            else:
                output_row = [l_row[l_key_idx], r_row[r_key_idx]]

            # optionally carry the edit distance in the output record
            if out_sim_score:
                output_row.append(edit_dist)
            output_rows.append(output_row)

        if prog_bar is not None:
            prog_bar.update()

    output_header = get_output_header_from_tables(l_key_attr, r_key_attr,
                                                  l_out_attrs, r_out_attrs,
                                                  l_out_prefix, r_out_prefix)
    if out_sim_score:
        output_header.append("_sim_score")

    # materialize the collected rows as a dataframe
    return pd.DataFrame(output_rows, columns=output_header)
def _edit_distance_join_split(ltable_list, rtable_list,
                              l_columns, r_columns,
                              l_key_attr, r_key_attr,
                              l_join_attr, r_join_attr,
                              tokenizer, threshold, comp_op,
                              l_out_attrs, r_out_attrs,
                              l_out_prefix, r_out_prefix,
                              out_sim_score, show_progress):
    """Edit-distance join over a single split of ltable and rtable."""
    include_attrs = l_out_attrs is not None or r_out_attrs is not None

    # column positions in ltable
    l_key_pos = l_columns.index(l_key_attr)
    l_join_pos = l_columns.index(l_join_attr)
    l_out_pos = find_output_attribute_indices(l_columns, l_out_attrs)

    # column positions in rtable
    r_key_pos = r_columns.index(r_key_attr)
    r_join_pos = r_columns.index(r_join_attr)
    r_out_pos = find_output_attribute_indices(r_columns, r_out_attrs)

    sim_measure_type = 'EDIT_DISTANCE'
    # token ordering derived from the join attributes of both input tables
    token_ordering = gen_token_ordering_for_tables(
        [ltable_list, rtable_list],
        [l_join_pos, r_join_pos],
        tokenizer, sim_measure_type)

    # cache the length of each ltable join-attribute value
    l_lengths = []
    for l_rec in ltable_list:
        l_lengths.append(len(l_rec[l_join_pos]))

    # index the ltable join attribute with a prefix index
    prefix_index = PrefixIndex(ltable_list, l_join_pos, tokenizer,
                               sim_measure_type, threshold, token_ordering)
    prefix_index.build(False)

    prefix_filter = PrefixFilter(tokenizer, sim_measure_type, threshold)
    comp_fn = COMP_OP_MAP[comp_op]
    sim_fn = get_sim_function(sim_measure_type)

    if show_progress:
        prog_bar = pyprind.ProgBar(len(rtable_list))

    output_rows = []
    for r_rec in rtable_list:
        r_val = r_rec[r_join_pos]
        # admissible ltable string-length window for this rtable record
        lo, hi = len(r_val) - threshold, len(r_val) + threshold

        ordered_tokens = order_using_token_ordering(
            tokenizer.tokenize(r_val), token_ordering)

        # the prefix filter yields the candidate ltable row ids
        candidates = prefix_filter.find_candidates(ordered_tokens,
                                                   prefix_index)

        for cand_id in candidates:
            # length filter before paying for the exact distance computation
            if lo <= l_lengths[cand_id] <= hi:
                l_rec = ltable_list[cand_id]
                dist = sim_fn(l_rec[l_join_pos], r_val)
                if comp_fn(dist, threshold):
                    if include_attrs:
                        row = get_output_row_from_tables(
                            l_rec, r_rec, l_key_pos, r_key_pos,
                            l_out_pos, r_out_pos)
                    else:
                        row = [l_rec[l_key_pos], r_rec[r_key_pos]]
                    # optionally append the distance to the output record
                    if out_sim_score:
                        row.append(dist)
                    output_rows.append(row)

        if show_progress:
            prog_bar.update()

    header = get_output_header_from_tables(l_key_attr, r_key_attr,
                                           l_out_attrs, r_out_attrs,
                                           l_out_prefix, r_out_prefix)
    if out_sim_score:
        header.append("_sim_score")

    # assemble the result dataframe from the accumulated rows
    return pd.DataFrame(output_rows, columns=header)
def _overlap_coefficient_join_split(ltable_list, rtable_list,
                                    l_columns, r_columns,
                                    l_key_attr, r_key_attr,
                                    l_join_attr, r_join_attr,
                                    tokenizer, threshold, comp_op,
                                    allow_empty,
                                    l_out_attrs, r_out_attrs,
                                    l_out_prefix, r_out_prefix,
                                    out_sim_score, show_progress):
    """Perform overlap coefficient join for a split of ltable and rtable."""
    # positions of the key / join / output attributes within each table
    l_key_idx = l_columns.index(l_key_attr)
    l_join_idx = l_columns.index(l_join_attr)
    l_out_idx = find_output_attribute_indices(l_columns, l_out_attrs)

    r_key_idx = r_columns.index(r_key_attr)
    r_join_idx = r_columns.index(r_join_attr)
    r_out_idx = find_output_attribute_indices(r_columns, r_out_attrs)

    # inverted index over the ltable join attribute; size_cache is needed
    # later for the overlap-coefficient denominator
    inverted_index = InvertedIndex(ltable_list, l_join_idx,
                                   tokenizer, cache_size_flag=True)
    # building with allow_empty also records the ids of ltable rows whose
    # join attribute tokenizes to the empty set
    l_empty_records = inverted_index.build(allow_empty)['empty_records']

    overlap_filter = OverlapFilter(tokenizer, 1)
    comp_fn = COMP_OP_MAP[comp_op]

    output_rows = []
    has_output_attributes = l_out_attrs is not None or r_out_attrs is not None

    def emit(l_rec, r_rec, score):
        # build one output record and stash it in output_rows
        if has_output_attributes:
            row = get_output_row_from_tables(l_rec, r_rec,
                                             l_key_idx, r_key_idx,
                                             l_out_idx, r_out_idx)
        else:
            row = [l_rec[l_key_idx], r_rec[r_key_idx]]
        if out_sim_score:
            row.append(score)
        output_rows.append(row)

    if show_progress:
        prog_bar = pyprind.ProgBar(len(rtable_list))

    for r_row in rtable_list:
        r_tokens = tokenizer.tokenize(r_row[r_join_idx])
        r_num_tokens = len(r_tokens)

        # under allow_empty, an empty rtable record pairs with every cached
        # empty ltable record at similarity 1.0
        if allow_empty and r_num_tokens == 0:
            for l_id in l_empty_records:
                emit(ltable_list[l_id], r_row, 1.0)
            continue

        # candidate ltable rows sharing at least one token, with overlap size
        candidate_overlap = overlap_filter.find_candidates(r_tokens,
                                                           inverted_index)

        for cand, overlap in iteritems(candidate_overlap):
            # overlap coefficient = |intersection| / min(|l|, |r|)
            sim_score = (float(overlap) /
                         float(min(r_num_tokens,
                                   inverted_index.size_cache[cand])))
            if comp_fn(sim_score, threshold):
                emit(ltable_list[cand], r_row, sim_score)

        if show_progress:
            prog_bar.update()

    output_header = get_output_header_from_tables(l_key_attr, r_key_attr,
                                                  l_out_attrs, r_out_attrs,
                                                  l_out_prefix, r_out_prefix)
    if out_sim_score:
        output_header.append("_sim_score")

    return pd.DataFrame(output_rows, columns=output_header)
Example #4
0
def _overlap_coefficient_join_split(
        ltable_list, rtable_list, l_columns, r_columns, l_key_attr, r_key_attr,
        l_join_attr, r_join_attr, tokenizer, threshold, comp_op, allow_empty,
        l_out_attrs, r_out_attrs, l_out_prefix, r_out_prefix, out_sim_score,
        show_progress):
    """Overlap-coefficient join computed over one partition of the tables."""
    want_attrs = l_out_attrs is not None or r_out_attrs is not None

    # locate key / join / output columns in the left table
    lk = l_columns.index(l_key_attr)
    lj = l_columns.index(l_join_attr)
    l_extra = find_output_attribute_indices(l_columns, l_out_attrs)

    # locate key / join / output columns in the right table
    rk = r_columns.index(r_key_attr)
    rj = r_columns.index(r_join_attr)
    r_extra = find_output_attribute_indices(r_columns, r_out_attrs)

    # inverted index over the left join attribute; token-set sizes are
    # cached so the coefficient denominator can be computed later
    index = InvertedIndex(ltable_list, lj, tokenizer, cache_size_flag=True)
    # ids of left records with an empty token set are cached while building,
    # to support the allow_empty flag
    empty_l_ids = index.build(allow_empty)['empty_records']

    overlap_filter = OverlapFilter(tokenizer, 1)
    passes = COMP_OP_MAP[comp_op]

    rows = []
    progress = pyprind.ProgBar(len(rtable_list)) if show_progress else None

    for r_rec in rtable_list:
        r_tokens = tokenizer.tokenize(r_rec[rj])
        r_size = len(r_tokens)

        if allow_empty and r_size == 0:
            # an empty right record matches every cached empty left record
            # with similarity 1.0
            for l_id in empty_l_ids:
                if want_attrs:
                    rec = get_output_row_from_tables(
                        ltable_list[l_id], r_rec, lk, rk, l_extra, r_extra)
                else:
                    rec = [ltable_list[l_id][lk], r_rec[rk]]
                if out_sim_score:
                    rec.append(1.0)
                rows.append(rec)
            continue

        # left candidates sharing at least one token, plus overlap counts
        for l_id, shared in iteritems(
                overlap_filter.find_candidates(r_tokens, index)):
            # overlap coefficient: |intersection| / min(|l|, |r|)
            score = float(shared) / float(min(r_size, index.size_cache[l_id]))
            if not passes(score, threshold):
                continue
            if want_attrs:
                rec = get_output_row_from_tables(
                    ltable_list[l_id], r_rec, lk, rk, l_extra, r_extra)
            else:
                rec = [ltable_list[l_id][lk], r_rec[rk]]
            if out_sim_score:
                rec.append(score)
            rows.append(rec)

        if progress is not None:
            progress.update()

    header = get_output_header_from_tables(l_key_attr, r_key_attr,
                                           l_out_attrs, r_out_attrs,
                                           l_out_prefix, r_out_prefix)
    if out_sim_score:
        header.append("_sim_score")

    return pd.DataFrame(rows, columns=header)
def get_pairs_with_missing_value_disk(ltable,
                                      rtable,
                                      l_key_attr,
                                      r_key_attr,
                                      l_join_attr,
                                      r_join_attr,
                                      temp_dir,
                                      data_limit_per_core,
                                      missing_pairs_file_name,
                                      l_out_attrs=None,
                                      r_out_attrs=None,
                                      l_out_prefix='l_',
                                      r_out_prefix='r_',
                                      out_sim_score=False,
                                      show_progress=True):
    """Write all pairs involving a missing join-attribute value to disk.

    Pairs are buffered in memory and appended to missing_pairs_file_name as
    headerless csv whenever the buffer grows past data_limit_per_core rows.
    Returns True on completion.

    Fix: when out_sim_score is set, rows generated from ltable records with
    a missing join value previously lacked the score column, producing rows
    of inconsistent width in the output file; they now carry np.NaN just
    like the rows generated from rtable records.
    """
    # NOTE(review): temp_dir is unused in this function — confirm whether
    # callers rely on it before removing the parameter.

    def _flush_to_disk(rows):
        # append the buffered rows to the output csv (no header, no index)
        df = pd.DataFrame(rows)
        with open(missing_pairs_file_name, 'a+') as out_file:
            df.to_csv(out_file, header=False, index=False)

    # find column indices of key attr, join attr and output attrs in ltable
    l_columns = list(ltable.columns.values)
    l_key_attr_index = l_columns.index(l_key_attr)
    l_join_attr_index = l_columns.index(l_join_attr)
    l_out_attrs_indices = find_output_attribute_indices(l_columns, l_out_attrs)

    # find column indices of key attr, join attr and output attrs in rtable
    r_columns = list(rtable.columns.values)
    r_key_attr_index = r_columns.index(r_key_attr)
    r_join_attr_index = r_columns.index(r_join_attr)
    r_out_attrs_indices = find_output_attribute_indices(r_columns, r_out_attrs)

    # find ltable records with missing value in l_join_attr
    ltable_missing = ltable[pd.isnull(ltable[l_join_attr])]

    # find ltable records which do not contain missing value in l_join_attr
    ltable_not_missing = ltable[pd.notnull(ltable[l_join_attr])]

    # find rtable records with missing value in r_join_attr
    rtable_missing = rtable[pd.isnull(rtable[r_join_attr])]

    output_rows = []
    has_output_attributes = (l_out_attrs is not None
                             or r_out_attrs is not None)

    if show_progress:
        print('Finding pairs with missing value...')
        prog_bar = pyprind.ProgBar(len(ltable_missing) + len(rtable_missing))

    # For each ltable record with missing value in l_join_attr,
    # output a pair corresponding to every record in rtable.
    for l_row in ltable_missing.itertuples(index=False):
        for r_row in rtable.itertuples(index=False):
            if has_output_attributes:
                record = get_output_row_from_tables(l_row, r_row,
                                                    l_key_attr_index,
                                                    r_key_attr_index,
                                                    l_out_attrs_indices,
                                                    r_out_attrs_indices)
            else:
                record = [l_row[l_key_attr_index], r_row[r_key_attr_index]]

            # no similarity can be computed for a missing value; emit NaN so
            # every row written to disk has the same number of columns
            if out_sim_score:
                record.append(np.NaN)

            output_rows.append(record)

            # flush to disk once the in-memory buffer exceeds the limit
            if len(output_rows) > data_limit_per_core:
                _flush_to_disk(output_rows)
                output_rows = []

        if show_progress:
            prog_bar.update()

    # flush any rows left over from the first pass
    if len(output_rows) > 0:
        _flush_to_disk(output_rows)
        output_rows = []

    # For each rtable record with missing value in r_join_attr,
    # output a pair corresponding to every record in ltable which
    # doesn't have a missing value in l_join_attr.
    for r_row in rtable_missing.itertuples(index=False):
        for l_row in ltable_not_missing.itertuples(index=False):
            if has_output_attributes:
                record = get_output_row_from_tables(l_row, r_row,
                                                    l_key_attr_index,
                                                    r_key_attr_index,
                                                    l_out_attrs_indices,
                                                    r_out_attrs_indices)
            else:
                record = [l_row[l_key_attr_index], r_row[r_key_attr_index]]

            if out_sim_score:
                record.append(np.NaN)

            output_rows.append(record)

            # flush to disk once the in-memory buffer exceeds the limit
            if len(output_rows) > data_limit_per_core:
                _flush_to_disk(output_rows)
                output_rows = []

        if show_progress:
            prog_bar.update()

    # flush any rows left over from the second pass
    if len(output_rows) > 0:
        _flush_to_disk(output_rows)
        output_rows = []

    return True
def set_sim_join(ltable, rtable, l_columns, r_columns, l_key_attr, r_key_attr,
                 l_join_attr, r_join_attr, tokenizer, sim_measure_type,
                 threshold, comp_op, allow_empty, l_out_attrs, r_out_attrs,
                 l_out_prefix, r_out_prefix, out_sim_score, show_progress):
    """Perform set similarity join for a split of ltable and rtable.

    Returns a pandas DataFrame of the matching pairs.

    Fix: removed a leftover debug counter (`k`) and the Python 2 style
    ``print 'k : ', k`` statement, which is a SyntaxError under Python 3
    and wrote debug noise to stdout on every call.
    """
    # find column indices of key attr, join attr and output attrs in ltable
    l_key_attr_index = l_columns.index(l_key_attr)
    l_join_attr_index = l_columns.index(l_join_attr)
    l_out_attrs_indices = find_output_attribute_indices(l_columns, l_out_attrs)

    # find column indices of key attr, join attr and output attrs in rtable
    r_key_attr_index = r_columns.index(r_key_attr)
    r_join_attr_index = r_columns.index(r_join_attr)
    r_out_attrs_indices = find_output_attribute_indices(r_columns, r_out_attrs)

    # generate token ordering using tokens in l_join_attr
    # and r_join_attr
    token_ordering = gen_token_ordering_for_tables(
        [ltable, rtable], [l_join_attr_index, r_join_attr_index], tokenizer,
        sim_measure_type)

    # Build position index on l_join_attr
    position_index = PositionIndex(ltable, l_join_attr_index, tokenizer,
                                   sim_measure_type, threshold, token_ordering)
    # While building the index, we cache the tokens and the empty records.
    # We cache the tokens so that we need not tokenize each string in
    # l_join_attr multiple times when we need to compute the similarity
    # measure. Further we cache the empty record ids to handle the
    # allow_empty flag.
    cached_data = position_index.build(allow_empty, cache_tokens=True)
    l_empty_records = cached_data['empty_records']
    cached_l_tokens = cached_data['cached_tokens']

    pos_filter = PositionFilter(tokenizer, sim_measure_type, threshold)

    sim_fn = get_sim_function(sim_measure_type)
    comp_fn = COMP_OP_MAP[comp_op]

    output_rows = []
    has_output_attributes = (l_out_attrs is not None
                             or r_out_attrs is not None)

    if show_progress:
        prog_bar = pyprind.ProgBar(len(rtable))

    for r_row in rtable:
        r_string = r_row[r_join_attr_index]

        # order the tokens using the token ordering.
        r_ordered_tokens = order_using_token_ordering(
            tokenizer.tokenize(r_string), token_ordering)

        # If allow_empty flag is set and the current rtable record has empty
        # set of tokens in the join attribute, then generate output pairs
        # joining the current rtable record with those records in ltable with
        # empty set of tokens in the join attribute. These ltable record ids
        # are cached in l_empty_records when building the position index.
        if allow_empty and len(r_ordered_tokens) == 0:
            for l_id in l_empty_records:
                if has_output_attributes:
                    output_row = get_output_row_from_tables(
                        ltable[l_id], r_row, l_key_attr_index,
                        r_key_attr_index, l_out_attrs_indices,
                        r_out_attrs_indices)
                else:
                    output_row = [
                        ltable[l_id][l_key_attr_index], r_row[r_key_attr_index]
                    ]

                if out_sim_score:
                    output_row.append(1.0)
                output_rows.append(output_row)
            continue

        # obtain candidates by applying position filter.
        candidate_overlap = pos_filter.find_candidates(r_ordered_tokens,
                                                       position_index)

        for cand, overlap in iteritems(candidate_overlap):
            if overlap > 0:
                l_ordered_tokens = cached_l_tokens[cand]
                # compute the actual similarity score
                sim_score = sim_fn(l_ordered_tokens, r_ordered_tokens)

                if comp_fn(sim_score, threshold):
                    if has_output_attributes:
                        output_row = get_output_row_from_tables(
                            ltable[cand], r_row, l_key_attr_index,
                            r_key_attr_index, l_out_attrs_indices,
                            r_out_attrs_indices)
                    else:
                        output_row = [
                            ltable[cand][l_key_attr_index],
                            r_row[r_key_attr_index]
                        ]

                    # if out_sim_score flag is set, append the similarity
                    # score to the output record.
                    if out_sim_score:
                        output_row.append(sim_score)

                    output_rows.append(output_row)

        if show_progress:
            prog_bar.update()

    output_header = get_output_header_from_tables(l_key_attr, r_key_attr,
                                                  l_out_attrs, r_out_attrs,
                                                  l_out_prefix, r_out_prefix)
    if out_sim_score:
        output_header.append("_sim_score")

    # generate a dataframe from the list of output rows
    output_table = pd.DataFrame(output_rows, columns=output_header)
    return output_table
def get_pairs_with_missing_value(ltable,
                                 rtable,
                                 l_key_attr,
                                 r_key_attr,
                                 l_join_attr,
                                 r_join_attr,
                                 l_out_attrs=None,
                                 r_out_attrs=None,
                                 l_out_prefix='l_',
                                 r_out_prefix='r_',
                                 out_sim_score=False,
                                 show_progress=True):
    """Return a DataFrame of all pairs involving a missing join value.

    Fix: when out_sim_score is set, the header gains a "_sim_score" column,
    but the rows produced from ltable records with a missing join value were
    built without the score element — the row width no longer matched the
    header passed to pd.DataFrame. Those rows now append np.NaN, matching
    the rows produced from rtable records.
    """
    # find column indices of key attr, join attr and output attrs in ltable
    l_columns = list(ltable.columns.values)
    l_key_attr_index = l_columns.index(l_key_attr)
    l_join_attr_index = l_columns.index(l_join_attr)
    l_out_attrs_indices = find_output_attribute_indices(l_columns, l_out_attrs)

    # find column indices of key attr, join attr and output attrs in rtable
    r_columns = list(rtable.columns.values)
    r_key_attr_index = r_columns.index(r_key_attr)
    r_join_attr_index = r_columns.index(r_join_attr)
    r_out_attrs_indices = find_output_attribute_indices(r_columns, r_out_attrs)

    # find ltable records with missing value in l_join_attr
    ltable_missing = ltable[pd.isnull(ltable[l_join_attr])]

    # find ltable records which do not contain missing value in l_join_attr
    ltable_not_missing = ltable[pd.notnull(ltable[l_join_attr])]

    # find rtable records with missing value in r_join_attr
    rtable_missing = rtable[pd.isnull(rtable[r_join_attr])]

    output_rows = []
    has_output_attributes = (l_out_attrs is not None
                             or r_out_attrs is not None)

    if show_progress:
        print('Finding pairs with missing value...')
        prog_bar = pyprind.ProgBar(len(ltable_missing) + len(rtable_missing))

    # For each ltable record with missing value in l_join_attr,
    # output a pair corresponding to every record in rtable.
    for l_row in ltable_missing.itertuples(index=False):
        for r_row in rtable.itertuples(index=False):
            if has_output_attributes:
                output_row = get_output_row_from_tables(
                    l_row, r_row, l_key_attr_index, r_key_attr_index,
                    l_out_attrs_indices, r_out_attrs_indices)
            else:
                output_row = [l_row[l_key_attr_index], r_row[r_key_attr_index]]

            # a score cannot be computed for a missing value; emit NaN so
            # the row width matches the header when out_sim_score is set
            if out_sim_score:
                output_row.append(np.NaN)

            output_rows.append(output_row)

        if show_progress:
            prog_bar.update()

    # For each rtable record with missing value in r_join_attr,
    # output a pair corresponding to every record in ltable which
    # doesn't have a missing value in l_join_attr.
    for r_row in rtable_missing.itertuples(index=False):
        for l_row in ltable_not_missing.itertuples(index=False):
            if has_output_attributes:
                output_row = get_output_row_from_tables(
                    l_row, r_row, l_key_attr_index, r_key_attr_index,
                    l_out_attrs_indices, r_out_attrs_indices)
            else:
                output_row = [l_row[l_key_attr_index], r_row[r_key_attr_index]]

            if out_sim_score:
                output_row.append(np.NaN)

            output_rows.append(output_row)

        if show_progress:
            prog_bar.update()

    output_header = get_output_header_from_tables(l_key_attr, r_key_attr,
                                                  l_out_attrs, r_out_attrs,
                                                  l_out_prefix, r_out_prefix)

    if out_sim_score:
        output_header.append("_sim_score")

    # generate a dataframe from the list of output rows
    output_table = pd.DataFrame(output_rows, columns=output_header)
    return output_table
def _apply_matcher_split(candset,
                         candset_l_key_attr, candset_r_key_attr,
                         ltable, rtable,
                         l_key_attr, r_key_attr,
                         l_match_attr, r_match_attr,
                         tokenizer, sim_function,
                         threshold, comp_op, allow_missing,
                         l_out_attrs, r_out_attrs,
                         l_out_prefix, r_out_prefix,
                         out_sim_score, show_progress, l_tokens, r_tokens):
    """Apply sim_function over a split of the candidate set and keep pairs
    whose score satisfies comp_op(score, threshold).

    If allow_missing is True, pairs where either match attribute is null are
    kept with a NaN score; otherwise such pairs are skipped. When tokenizer
    is given and both l_tokens and r_tokens caches are provided, the cached
    tokens are used instead of re-tokenizing the attribute values.

    Returns a pandas DataFrame whose first column is '_id' (copied from the
    candset row id), followed by the key columns, any requested output
    attributes, and an optional '_sim_score' column.
    """
    # find column indices of key attr, match attr and output attrs in ltable
    l_columns = list(ltable.columns.values)
    l_key_attr_index = l_columns.index(l_key_attr)
    l_match_attr_index = l_columns.index(l_match_attr)
    l_out_attrs_indices = find_output_attribute_indices(l_columns, l_out_attrs)

    # find column indices of key attr, match attr and output attrs in rtable
    r_columns = list(rtable.columns.values)
    r_key_attr_index = r_columns.index(r_key_attr)
    r_match_attr_index = r_columns.index(r_match_attr)
    r_out_attrs_indices = find_output_attribute_indices(r_columns, r_out_attrs)

    # Build a dictionary on ltable keyed by l_key_attr for O(1) row lookup.
    # remove_null=False keeps rows with a missing match attribute, which is
    # required when allow_missing is True.
    ltable_dict = build_dict_from_table(ltable, l_key_attr_index,
                                        l_match_attr_index, remove_null=False)

    # Build a dictionary on rtable
    rtable_dict = build_dict_from_table(rtable, r_key_attr_index,
                                        r_match_attr_index, remove_null=False)

    # Find indices of l_key_attr and r_key_attr in candset
    candset_columns = list(candset.columns.values)
    candset_l_key_attr_index = candset_columns.index(candset_l_key_attr)
    candset_r_key_attr_index = candset_columns.index(candset_r_key_attr)

    comp_fn = COMP_OP_MAP[comp_op]
    has_output_attributes = (l_out_attrs is not None or
                             r_out_attrs is not None)

    output_rows = []

    if show_progress:
        prog_bar = pyprind.ProgBar(len(candset))

    tokenize_flag = False
    if tokenizer is not None:
        tokenize_flag = True
        use_cache = False
        # check if we have cached the tokens.
        if l_tokens is not None and r_tokens is not None:
            use_cache = True

    for candset_row in candset.itertuples(index=False):
        l_id = candset_row[candset_l_key_attr_index]
        r_id = candset_row[candset_r_key_attr_index]

        l_row = ltable_dict[l_id]
        r_row = rtable_dict[r_id]

        l_apply_col_value = l_row[l_match_attr_index]
        r_apply_col_value = r_row[r_match_attr_index]

        allow_pair = False
        # Check if one of the inputs is missing. If yes, check the
        # allow_missing flag. If it is True, then add the pair to output.
        # Else, continue. If none of the inputs is missing, then proceed to
        # apply the sim_function.
        if pd.isnull(l_apply_col_value) or pd.isnull(r_apply_col_value):
            if allow_missing:
                allow_pair = True
                # pd.np was removed in pandas 2.0; use numpy's nan directly.
                sim_score = np.nan
            else:
                continue
        else:
            if tokenize_flag:
                # If we have cached the tokens, we use them directly. Else,
                # we tokenize the values.
                if use_cache:
                    l_apply_col_value = l_tokens[l_id]
                    r_apply_col_value = r_tokens[r_id]
                else:
                    l_apply_col_value = tokenizer.tokenize(l_apply_col_value)
                    r_apply_col_value = tokenizer.tokenize(r_apply_col_value)

            sim_score = sim_function(l_apply_col_value, r_apply_col_value)
            allow_pair = comp_fn(sim_score, threshold)

        if allow_pair:
            if has_output_attributes:
                output_row = get_output_row_from_tables(
                                 l_row, r_row,
                                 l_key_attr_index, r_key_attr_index,
                                 l_out_attrs_indices,
                                 r_out_attrs_indices)
                # prepend the candset row id so output rows can be traced
                # back to the input candidate set.
                output_row.insert(0, candset_row[0])
            else:
                output_row = [candset_row[0], l_id, r_id]
            if out_sim_score:
                output_row.append(sim_score)
            output_rows.append(output_row)

        if show_progress:
            prog_bar.update()

    output_header = get_output_header_from_tables(
                        l_key_attr, r_key_attr,
                        l_out_attrs, r_out_attrs,
                        l_out_prefix, r_out_prefix)
    output_header.insert(0, '_id')
    if out_sim_score:
        output_header.append("_sim_score")

    # generate a dataframe from the list of output rows
    output_table = pd.DataFrame(output_rows, columns=output_header)
    return output_table
def _filter_tables_split(ltable, rtable,
                         l_columns, r_columns,
                         l_key_attr, r_key_attr,
                         l_filter_attr, r_filter_attr,
                         suffix_filter,
                         l_out_attrs, r_out_attrs,
                         l_out_prefix, r_out_prefix, show_progress):
    """Apply the suffix filter to a split of ltable and rtable.

    Each (l_row, r_row) pair is tested with suffix_filter._filter_suffix on
    the ordered token suffixes of the two filter attributes; pairs that are
    NOT pruned by the filter are added to the output. Returns a pandas
    DataFrame of the surviving pairs (keys plus any requested output
    attributes).
    """
    # find column indices of key attr, filter attr and output attrs in ltable
    l_key_attr_index = l_columns.index(l_key_attr)
    l_filter_attr_index = l_columns.index(l_filter_attr)
    l_out_attrs_indices = find_output_attribute_indices(l_columns, l_out_attrs)

    # find column indices of key attr, filter attr and output attrs in rtable
    r_key_attr_index = r_columns.index(r_key_attr)
    r_filter_attr_index = r_columns.index(r_filter_attr)
    r_out_attrs_indices = find_output_attribute_indices(r_columns, r_out_attrs)

    # generate token ordering using tokens in l_filter_attr and r_filter_attr
    token_ordering = gen_token_ordering_for_tables(
                                [ltable, rtable],
                                [l_filter_attr_index, r_filter_attr_index],
                                suffix_filter.tokenizer,
                                suffix_filter.sim_measure_type)

    # ignore allow_empty flag for OVERLAP and EDIT_DISTANCE measures.
    handle_empty = (suffix_filter.allow_empty and
        suffix_filter.sim_measure_type not in ['OVERLAP', 'EDIT_DISTANCE'])

    output_rows = []
    has_output_attributes = (l_out_attrs is not None or
                             r_out_attrs is not None)

    if show_progress:
        prog_bar = pyprind.ProgBar(len(ltable))

    for l_row in ltable:
        l_string = l_row[l_filter_attr_index]

        # tokenize and order l's tokens once per outer row; the suffix is
        # everything past the prefix and is reused for every r_row below.
        ltokens = suffix_filter.tokenizer.tokenize(l_string)
        ordered_ltokens = order_using_token_ordering(ltokens, token_ordering)
        l_num_tokens = len(ordered_ltokens)
        l_prefix_length = get_prefix_length(l_num_tokens,
                                            suffix_filter.sim_measure_type,
                                            suffix_filter.threshold,
                                            suffix_filter.tokenizer)

        l_suffix = ordered_ltokens[l_prefix_length:]
        for r_row in rtable:
            r_string = r_row[r_filter_attr_index]

            rtokens = suffix_filter.tokenizer.tokenize(r_string)
            ordered_rtokens = order_using_token_ordering(rtokens,
                                                         token_ordering)
            r_num_tokens = len(ordered_rtokens)

            # If allow_empty flag is set, then add the pair to the output.
            if handle_empty and l_num_tokens == 0 and r_num_tokens == 0:
                if has_output_attributes:
                    output_row = get_output_row_from_tables(
                                         l_row, r_row,
                                         l_key_attr_index, r_key_attr_index,
                                         l_out_attrs_indices,
                                         r_out_attrs_indices)
                else:
                    output_row = [l_row[l_key_attr_index],
                                  r_row[r_key_attr_index]]

                output_rows.append(output_row)
                continue

            r_prefix_length = get_prefix_length(r_num_tokens,
                                                suffix_filter.sim_measure_type,
                                                suffix_filter.threshold,
                                                suffix_filter.tokenizer)

            # a non-positive prefix length means the pair cannot satisfy the
            # threshold; skip it without running the suffix filter.
            if l_prefix_length <= 0 or r_prefix_length <= 0:
                continue

            # _filter_suffix returns True when the pair should be pruned,
            # so pairs surviving the filter (False) are written to output.
            if not suffix_filter._filter_suffix(l_suffix,
                                           ordered_rtokens[r_prefix_length:],
                                           l_prefix_length, r_prefix_length,
                                           l_num_tokens, r_num_tokens):
                if has_output_attributes:
                    output_row = get_output_row_from_tables(
                                         l_row, r_row,
                                         l_key_attr_index, r_key_attr_index, 
                                         l_out_attrs_indices,
                                         r_out_attrs_indices)
                else:
                    output_row = [l_row[l_key_attr_index],
                                  r_row[r_key_attr_index]]

                output_rows.append(output_row)

        if show_progress:
            prog_bar.update()

    output_header = get_output_header_from_tables(
                            l_key_attr, r_key_attr,
                            l_out_attrs, r_out_attrs, 
                            l_out_prefix, r_out_prefix)

    # generate a dataframe from the list of output rows
    output_table = pd.DataFrame(output_rows, columns=output_header)
    return output_table
def get_pairs_with_missing_value_disk(ltable, rtable,
                                      l_key_attr, r_key_attr,
                                      l_join_attr, r_join_attr,
                                      temp_dir, data_limit_per_core,
                                      missing_pairs_file_name, l_out_attrs=None,
                                      r_out_attrs=None, l_out_prefix='l_',
                                      r_out_prefix='r_', out_sim_score=False,
                                      show_progress=True):
    """Generate pairs involving a missing join-attribute value and append
    them to missing_pairs_file_name on disk.

    Every ltable row with a null l_join_attr is paired with every rtable
    row, and every rtable row with a null r_join_attr is paired with every
    ltable row whose l_join_attr is not null (so such pairs are emitted only
    once). Rows are buffered in memory and flushed to the CSV file whenever
    the buffer exceeds data_limit_per_core. Returns True on completion.

    NOTE(review): temp_dir is unused in this function; it appears to be kept
    only for interface compatibility with callers — confirm before removing.
    """

    def _flush_rows(rows):
        """Append buffered rows to the output CSV and return a fresh buffer."""
        df = pd.DataFrame(rows)
        with open(missing_pairs_file_name, 'a+') as out_file:
            df.to_csv(out_file, header=False, index=False)
        return []

    # find column indices of key attr, join attr and output attrs in ltable
    l_columns = list(ltable.columns.values)
    l_key_attr_index = l_columns.index(l_key_attr)
    l_join_attr_index = l_columns.index(l_join_attr)
    l_out_attrs_indices = find_output_attribute_indices(l_columns, l_out_attrs)

    # find column indices of key attr, join attr and output attrs in rtable
    r_columns = list(rtable.columns.values)
    r_key_attr_index = r_columns.index(r_key_attr)
    r_join_attr_index = r_columns.index(r_join_attr)
    r_out_attrs_indices = find_output_attribute_indices(r_columns, r_out_attrs)

    # find ltable records with missing value in l_join_attr
    ltable_missing = ltable[pd.isnull(ltable[l_join_attr])]

    # find ltable records which do not contain missing value in l_join_attr
    ltable_not_missing = ltable[pd.notnull(ltable[l_join_attr])]

    # find rtable records with missing value in r_join_attr
    rtable_missing = rtable[pd.isnull(rtable[r_join_attr])]

    output_rows = []
    has_output_attributes = (l_out_attrs is not None or
                             r_out_attrs is not None)

    if show_progress:
        print('Finding pairs with missing value...')
        prog_bar = pyprind.ProgBar(len(ltable_missing) + len(rtable_missing))

    # For each ltable record with missing value in l_join_attr,
    # output a pair corresponding to every record in rtable.
    for l_row in ltable_missing.itertuples(index=False):
        for r_row in rtable.itertuples(index=False):
            if has_output_attributes:
                record = get_output_row_from_tables(
                                 l_row, r_row,
                                 l_key_attr_index, r_key_attr_index,
                                 l_out_attrs_indices, r_out_attrs_indices)
            else:
                record = [l_row[l_key_attr_index], r_row[r_key_attr_index]]
            output_rows.append(record)

            # Flush to disk if the in-memory buffer exceeds the permissible
            # data limit.
            if len(output_rows) > data_limit_per_core:
                output_rows = _flush_rows(output_rows)

        if show_progress:
            prog_bar.update()

    # if output rows have some data left, flush them to disk so the file is
    # consistent before starting the second pass.
    if len(output_rows) > 0:
        output_rows = _flush_rows(output_rows)

    # For each rtable record with missing value in r_join_attr,
    # output a pair corresponding to every record in ltable which
    # doesn't have a missing value in l_join_attr.
    for r_row in rtable_missing.itertuples(index=False):
        for l_row in ltable_not_missing.itertuples(index=False):
            if has_output_attributes:
                record = get_output_row_from_tables(
                                 l_row, r_row,
                                 l_key_attr_index, r_key_attr_index,
                                 l_out_attrs_indices, r_out_attrs_indices)
            else:
                record = [l_row[l_key_attr_index], r_row[r_key_attr_index]]

            if out_sim_score:
                # similarity is undefined for missing values; pd.np was
                # removed in pandas 2.0, so use numpy's nan directly.
                record.append(np.nan)

            output_rows.append(record)

            # Flush to disk if the in-memory buffer exceeds the permissible
            # data limit.
            if len(output_rows) > data_limit_per_core:
                output_rows = _flush_rows(output_rows)

        if show_progress:
            prog_bar.update()

    # flush any remaining buffered rows to disk.
    if len(output_rows) > 0:
        output_rows = _flush_rows(output_rows)

    return True
def _filter_tables_split(ltable, rtable,
                         l_columns, r_columns,
                         l_key_attr, r_key_attr,
                         l_filter_attr, r_filter_attr,
                         size_filter, 
                         l_out_attrs, r_out_attrs,
                         l_out_prefix, r_out_prefix, show_progress):
    """Apply the size filter to a split of ltable and rtable.

    Builds a size index on ltable's filter attribute, then for each rtable
    row probes the index with the row's token count to find candidate
    ltable rows. If size_filter.allow_empty is set (and the measure is not
    OVERLAP/EDIT_DISTANCE), rtable rows with zero tokens are paired with the
    ltable rows that also have zero tokens. Returns a pandas DataFrame of
    the candidate pairs.
    """
    # find column indices of key attr, filter attr and output attrs in ltable
    l_key_attr_index = l_columns.index(l_key_attr)
    l_filter_attr_index = l_columns.index(l_filter_attr)
    # (removed a dead "l_out_attrs_indices = []" that was immediately
    # overwritten by the call below)
    l_out_attrs_indices = find_output_attribute_indices(l_columns, l_out_attrs)

    # find column indices of key attr, filter attr and output attrs in rtable
    r_key_attr_index = r_columns.index(r_key_attr)
    r_filter_attr_index = r_columns.index(r_filter_attr)
    r_out_attrs_indices = find_output_attribute_indices(r_columns, r_out_attrs)

    # ignore allow_empty flag for OVERLAP and EDIT_DISTANCE measures.
    handle_empty = (size_filter.allow_empty and 
        size_filter.sim_measure_type not in ['OVERLAP', 'EDIT_DISTANCE'])

    # Build size index over ltable
    size_index = SizeIndex(ltable, l_filter_attr_index,
                           size_filter.tokenizer)
    # While building the index, we cache the record ids with empty set of 
    # tokens. This is needed to handle the allow_empty flag.
    cached_data = size_index.build(handle_empty)
    l_empty_records = cached_data['empty_records']

    output_rows = []
    has_output_attributes = (l_out_attrs is not None or
                             r_out_attrs is not None)

    if show_progress:
        prog_bar = pyprind.ProgBar(len(rtable))

    for r_row in rtable:
        r_string = r_row[r_filter_attr_index]

        r_num_tokens = len(size_filter.tokenizer.tokenize(r_string))

        # If allow_empty flag is set and the current rtable record has empty
        # set of tokens in the filter attribute, then generate output pairs
        # joining the current rtable record with those records in ltable with
        # empty set of tokens in the filter attribute. These ltable record
        # ids are cached in l_empty_records list which was constructed when
        # building the size index.
        if handle_empty and r_num_tokens == 0:
            for l_id in l_empty_records:
                if has_output_attributes:
                    output_row = get_output_row_from_tables(
                                     ltable[l_id], r_row,
                                     l_key_attr_index, r_key_attr_index,
                                     l_out_attrs_indices,
                                     r_out_attrs_indices)
                else:
                    output_row = [ltable[l_id][l_key_attr_index],
                                  r_row[r_key_attr_index]]

                output_rows.append(output_row)
            continue

        # probe size index and find candidates
        candidates = size_filter.find_candidates(r_num_tokens, size_index)

        for cand in candidates:
            if has_output_attributes:
                output_row = get_output_row_from_tables(
                                     ltable[cand], r_row,
                                     l_key_attr_index, r_key_attr_index, 
                                     l_out_attrs_indices, r_out_attrs_indices)
            else:
                output_row = [ltable[cand][l_key_attr_index],
                              r_row[r_key_attr_index]]

            output_rows.append(output_row)

        if show_progress:
            prog_bar.update()

    output_header = get_output_header_from_tables(l_key_attr, r_key_attr,
                                                  l_out_attrs, r_out_attrs, 
                                                  l_out_prefix, r_out_prefix)

    # generate a dataframe from the list of output rows
    output_table = pd.DataFrame(output_rows, columns=output_header)
    return output_table
# Exemple #12
# 0
def _filter_tables_split(ltable, rtable, l_columns, r_columns, l_key_attr,
                         r_key_attr, l_filter_attr, r_filter_attr, size_filter,
                         l_out_attrs, r_out_attrs, l_out_prefix, r_out_prefix,
                         show_progress):
    """Apply the size filter to a split of ltable and rtable.

    Builds a size index on ltable's filter attribute, then for each rtable
    row probes the index with the row's token count to find candidate
    ltable rows. If size_filter.allow_empty is set (and the measure is not
    OVERLAP/EDIT_DISTANCE), rtable rows with zero tokens are paired with the
    ltable rows that also have zero tokens. Returns a pandas DataFrame of
    the candidate pairs.
    """
    # find column indices of key attr, filter attr and output attrs in ltable
    l_key_attr_index = l_columns.index(l_key_attr)
    l_filter_attr_index = l_columns.index(l_filter_attr)
    # (removed a dead "l_out_attrs_indices = []" that was immediately
    # overwritten by the call below)
    l_out_attrs_indices = find_output_attribute_indices(l_columns, l_out_attrs)

    # find column indices of key attr, filter attr and output attrs in rtable
    r_key_attr_index = r_columns.index(r_key_attr)
    r_filter_attr_index = r_columns.index(r_filter_attr)
    r_out_attrs_indices = find_output_attribute_indices(r_columns, r_out_attrs)

    # ignore allow_empty flag for OVERLAP and EDIT_DISTANCE measures.
    handle_empty = (size_filter.allow_empty and size_filter.sim_measure_type
                    not in ['OVERLAP', 'EDIT_DISTANCE'])

    # Build size index over ltable
    size_index = SizeIndex(ltable, l_filter_attr_index, size_filter.tokenizer)
    # While building the index, we cache the record ids with empty set of
    # tokens. This is needed to handle the allow_empty flag.
    cached_data = size_index.build(handle_empty)
    l_empty_records = cached_data['empty_records']

    output_rows = []
    has_output_attributes = (l_out_attrs is not None
                             or r_out_attrs is not None)

    if show_progress:
        prog_bar = pyprind.ProgBar(len(rtable))

    for r_row in rtable:
        r_string = r_row[r_filter_attr_index]

        r_num_tokens = len(size_filter.tokenizer.tokenize(r_string))

        # If allow_empty flag is set and the current rtable record has empty
        # set of tokens in the filter attribute, then generate output pairs
        # joining the current rtable record with those records in ltable with
        # empty set of tokens in the filter attribute. These ltable record
        # ids are cached in l_empty_records list which was constructed when
        # building the size index.
        if handle_empty and r_num_tokens == 0:
            for l_id in l_empty_records:
                if has_output_attributes:
                    output_row = get_output_row_from_tables(
                        ltable[l_id], r_row, l_key_attr_index,
                        r_key_attr_index, l_out_attrs_indices,
                        r_out_attrs_indices)
                else:
                    output_row = [
                        ltable[l_id][l_key_attr_index], r_row[r_key_attr_index]
                    ]

                output_rows.append(output_row)
            continue

        # probe size index and find candidates
        candidates = size_filter.find_candidates(r_num_tokens, size_index)

        for cand in candidates:
            if has_output_attributes:
                output_row = get_output_row_from_tables(
                    ltable[cand], r_row, l_key_attr_index, r_key_attr_index,
                    l_out_attrs_indices, r_out_attrs_indices)
            else:
                output_row = [
                    ltable[cand][l_key_attr_index], r_row[r_key_attr_index]
                ]

            output_rows.append(output_row)

        if show_progress:
            prog_bar.update()

    output_header = get_output_header_from_tables(l_key_attr, r_key_attr,
                                                  l_out_attrs, r_out_attrs,
                                                  l_out_prefix, r_out_prefix)

    output_table = pd.DataFrame(output_rows, columns=output_header)
    return output_table
def get_pairs_with_missing_value(ltable, rtable,
                                 l_key_attr, r_key_attr,
                                 l_join_attr, r_join_attr,
                                 l_out_attrs=None, r_out_attrs=None,
                                 l_out_prefix='l_', r_out_prefix='r_',
                                 out_sim_score=False, show_progress=True):
    """Return a DataFrame of pairs involving a missing join-attribute value.

    Every ltable row with a null l_join_attr is paired with every rtable
    row, and every rtable row with a null r_join_attr is paired with every
    ltable row whose l_join_attr is not null (so such pairs are emitted only
    once). If out_sim_score is True a '_sim_score' column filled with NaN is
    appended, since similarity is undefined for missing values.
    """
    # find column indices of key attr, join attr and output attrs in ltable
    l_columns = list(ltable.columns.values)
    l_key_attr_index = l_columns.index(l_key_attr)
    l_join_attr_index = l_columns.index(l_join_attr)
    l_out_attrs_indices = find_output_attribute_indices(l_columns, l_out_attrs)

    # find column indices of key attr, join attr and output attrs in rtable
    r_columns = list(rtable.columns.values)
    r_key_attr_index = r_columns.index(r_key_attr)
    r_join_attr_index = r_columns.index(r_join_attr)
    r_out_attrs_indices = find_output_attribute_indices(r_columns, r_out_attrs)

    # find ltable records with missing value in l_join_attr
    ltable_missing = ltable[pd.isnull(ltable[l_join_attr])]

    # find ltable records which do not contain missing value in l_join_attr
    ltable_not_missing = ltable[pd.notnull(ltable[l_join_attr])]

    # find rtable records with missing value in r_join_attr
    rtable_missing = rtable[pd.isnull(rtable[r_join_attr])]

    output_rows = []
    has_output_attributes = (l_out_attrs is not None or
                             r_out_attrs is not None)

    if show_progress:
        print('Finding pairs with missing value...')
        prog_bar = pyprind.ProgBar(len(ltable_missing) + len(rtable_missing))

    # For each ltable record with missing value in l_join_attr,
    # output a pair corresponding to every record in rtable.
    for l_row in ltable_missing.itertuples(index=False):
        for r_row in rtable.itertuples(index=False):
            if has_output_attributes:
                output_row = get_output_row_from_tables(
                                 l_row, r_row,
                                 l_key_attr_index, r_key_attr_index,
                                 l_out_attrs_indices, r_out_attrs_indices)
            else:
                output_row = [l_row[l_key_attr_index], r_row[r_key_attr_index]]
            output_rows.append(output_row)

        if show_progress:
            prog_bar.update()

    # For each rtable record with missing value in r_join_attr,
    # output a pair corresponding to every record in ltable which
    # doesn't have a missing value in l_join_attr.
    for r_row in rtable_missing.itertuples(index=False):
        for l_row in ltable_not_missing.itertuples(index=False):
            if has_output_attributes:
                output_row = get_output_row_from_tables(
                                 l_row, r_row,
                                 l_key_attr_index, r_key_attr_index,
                                 l_out_attrs_indices, r_out_attrs_indices)
            else:
                output_row = [l_row[l_key_attr_index], r_row[r_key_attr_index]]

            if out_sim_score:
                # pd.np was removed in pandas 2.0; use numpy's nan directly.
                output_row.append(np.nan)

            output_rows.append(output_row)

        if show_progress:
            prog_bar.update()

    output_header = get_output_header_from_tables(
                        l_key_attr, r_key_attr,
                        l_out_attrs, r_out_attrs,
                        l_out_prefix, r_out_prefix)

    if out_sim_score:
        output_header.append("_sim_score")

    # generate a dataframe from the list of output rows
    output_table = pd.DataFrame(output_rows, columns=output_header)
    return output_table
def _filter_tables_split(ltable, rtable,
                         l_columns, r_columns,
                         l_key_attr, r_key_attr,
                         l_filter_attr, r_filter_attr,
                         overlap_filter,
                         l_out_attrs, r_out_attrs,
                         l_out_prefix, r_out_prefix,
                         out_sim_score, show_progress):
    """Apply the overlap filter to a split of ltable and rtable.

    Builds an inverted index over ltable's filter attribute tokens, then for
    each rtable row finds candidate ltable rows and their token overlap, and
    outputs pairs whose overlap satisfies
    comp_op(overlap, overlap_filter.overlap_size). Returns a pandas
    DataFrame, optionally with a '_sim_score' column holding the overlap.
    """
    # Find column indices of key attr, filter attr and output attrs in ltable
    l_key_attr_index = l_columns.index(l_key_attr)
    l_filter_attr_index = l_columns.index(l_filter_attr)
    # (removed a dead "l_out_attrs_indices = []" that was immediately
    # overwritten by the call below)
    l_out_attrs_indices = find_output_attribute_indices(l_columns, l_out_attrs)

    # Find column indices of key attr, filter attr and output attrs in rtable
    r_key_attr_index = r_columns.index(r_key_attr)
    r_filter_attr_index = r_columns.index(r_filter_attr)
    r_out_attrs_indices = find_output_attribute_indices(r_columns, r_out_attrs)

    # Build inverted index over ltable
    inverted_index = InvertedIndex(ltable, l_filter_attr_index,
                                   overlap_filter.tokenizer)
    inverted_index.build(False)

    comp_fn = COMP_OP_MAP[overlap_filter.comp_op]

    output_rows = []
    has_output_attributes = (l_out_attrs is not None or
                             r_out_attrs is not None)

    if show_progress:
        prog_bar = pyprind.ProgBar(len(rtable))

    for r_row in rtable:
        r_string = r_row[r_filter_attr_index]
        r_filter_attr_tokens = overlap_filter.tokenizer.tokenize(r_string)

        # probe inverted index and find overlap of candidates
        candidate_overlap = overlap_filter.find_candidates(
                                r_filter_attr_tokens, inverted_index)

        for cand, overlap in iteritems(candidate_overlap):
            if comp_fn(overlap, overlap_filter.overlap_size):
                if has_output_attributes:
                    output_row = get_output_row_from_tables(
                                     ltable[cand], r_row,
                                     l_key_attr_index, r_key_attr_index, 
                                     l_out_attrs_indices, r_out_attrs_indices)
                else:
                    output_row = [ltable[cand][l_key_attr_index],
                                  r_row[r_key_attr_index]]

                if out_sim_score:
                    output_row.append(overlap)
                output_rows.append(output_row)

        if show_progress:
            prog_bar.update()

    output_header = get_output_header_from_tables(l_key_attr, r_key_attr,
                                                  l_out_attrs, r_out_attrs,
                                                  l_out_prefix, r_out_prefix)
    if out_sim_score:
        output_header.append("_sim_score")

    output_table = pd.DataFrame(output_rows, columns=output_header)
    return output_table
def set_sim_join(ltable, rtable,
                 l_columns, r_columns,
                 l_key_attr, r_key_attr,
                 l_join_attr, r_join_attr,
                 tokenizer, sim_measure_type, threshold, comp_op,
                 allow_empty,
                 l_out_attrs, r_out_attrs,
                 l_out_prefix, r_out_prefix,
                 out_sim_score, show_progress):
    """Perform set similarity join for a split of ltable and rtable.

    Builds a position index on ltable's join attribute, then for each rtable
    row uses a position filter to find candidates, computes the actual
    similarity for candidates with positive overlap, and keeps pairs that
    satisfy comp_op(sim_score, threshold). If allow_empty is True, rows with
    empty token sets on both sides are matched with similarity 1.0. Returns
    a pandas DataFrame of the matching pairs.
    """

    # find column indices of key attr, join attr and output attrs in ltable
    l_key_attr_index = l_columns.index(l_key_attr)
    l_join_attr_index = l_columns.index(l_join_attr)
    l_out_attrs_indices = find_output_attribute_indices(l_columns, l_out_attrs)

    # find column indices of key attr, join attr and output attrs in rtable
    r_key_attr_index = r_columns.index(r_key_attr)
    r_join_attr_index = r_columns.index(r_join_attr)
    r_out_attrs_indices = find_output_attribute_indices(r_columns, r_out_attrs)

    # generate token ordering using tokens in l_join_attr
    # and r_join_attr
    token_ordering = gen_token_ordering_for_tables(
                         [ltable, rtable],
                         [l_join_attr_index, r_join_attr_index],
                         tokenizer, sim_measure_type)

    # Build position index on l_join_attr
    position_index = PositionIndex(ltable, l_join_attr_index,
                                   tokenizer, sim_measure_type,
                                   threshold, token_ordering)
    # While building the index, we cache the tokens and the empty records.
    # We cache the tokens so that we need not tokenize each string in 
    # l_join_attr multiple times when we need to compute the similarity measure.
    # Further we cache the empty record ids to handle the allow_empty flag.
    cached_data = position_index.build(allow_empty, cache_tokens=True)
    l_empty_records = cached_data['empty_records']
    cached_l_tokens = cached_data['cached_tokens']

    pos_filter = PositionFilter(tokenizer, sim_measure_type, threshold)

    sim_fn = get_sim_function(sim_measure_type)
    comp_fn = COMP_OP_MAP[comp_op]

    output_rows = []
    has_output_attributes = (l_out_attrs is not None or
                             r_out_attrs is not None)

    if show_progress:
        prog_bar = pyprind.ProgBar(len(rtable))

    for r_row in rtable:
        r_string = r_row[r_join_attr_index]

        # order the tokens using the token ordering.
        r_ordered_tokens = order_using_token_ordering(
                tokenizer.tokenize(r_string), token_ordering)

        # If allow_empty flag is set and the current rtable record has empty set
        # of tokens in the join attribute, then generate output pairs joining 
        # the current rtable record with those records in ltable with empty set 
        # of tokens in the join attribute. These ltable record ids are cached in
        # l_empty_records list which was constructed when building the position
        # index.
        if allow_empty and len(r_ordered_tokens) == 0:
            for l_id in l_empty_records:
                if has_output_attributes:
                    output_row = get_output_row_from_tables(
                                     ltable[l_id], r_row,
                                     l_key_attr_index, r_key_attr_index,
                                     l_out_attrs_indices,
                                     r_out_attrs_indices)
                else:
                    output_row = [ltable[l_id][l_key_attr_index],
                                  r_row[r_key_attr_index]]

                # two empty token sets are considered identical.
                if out_sim_score:
                    output_row.append(1.0)
                output_rows.append(output_row)
            continue

        # obtain candidates by applying position filter.
        candidate_overlap = pos_filter.find_candidates(r_ordered_tokens,
                                                       position_index)

        for cand, overlap in iteritems(candidate_overlap):
            if overlap > 0:
                l_ordered_tokens = cached_l_tokens[cand]
                # compute the actual similarity score
                sim_score = sim_fn(l_ordered_tokens, r_ordered_tokens)

                if comp_fn(sim_score, threshold):
                    if has_output_attributes:
                        output_row = get_output_row_from_tables(
                                         ltable[cand], r_row,
                                         l_key_attr_index, r_key_attr_index,
                                         l_out_attrs_indices,
                                         r_out_attrs_indices)
                    else:
                        output_row = [ltable[cand][l_key_attr_index],
                                      r_row[r_key_attr_index]]

                    # if out_sim_score flag is set, append the similarity score    
                    # to the output record.  
                    if out_sim_score:
                        output_row.append(sim_score)

                    output_rows.append(output_row)

        if show_progress:
            prog_bar.update()

    # NOTE: removed leftover debug instrumentation — a candidate counter `k`
    # and a Python 2 `print 'k : ', k` statement that is a SyntaxError under
    # Python 3 (where the rest of this file's print() calls target).

    output_header = get_output_header_from_tables(
                        l_key_attr, r_key_attr,
                        l_out_attrs, r_out_attrs,
                        l_out_prefix, r_out_prefix)
    if out_sim_score:
        output_header.append("_sim_score")

    # generate a dataframe from the list of output rows
    output_table = pd.DataFrame(output_rows, columns=output_header)
    return output_table
def _apply_matcher_split(candset, candset_l_key_attr, candset_r_key_attr,
                         ltable, rtable, l_key_attr, r_key_attr, l_match_attr,
                         r_match_attr, tokenizer, sim_function, threshold,
                         comp_op, allow_missing, l_out_attrs, r_out_attrs,
                         l_out_prefix, r_out_prefix, out_sim_score,
                         show_progress, l_tokens, r_tokens):
    """Apply a similarity matcher over one split of the candidate set.

    For every (l_id, r_id) pair in candset, the corresponding rows are
    looked up in ltable and rtable, sim_function is applied to their match
    attributes (tokenized first when a tokenizer is given, reusing the
    l_tokens / r_tokens caches when both are provided), and the pair is
    kept when comp_op(sim_score, threshold) is satisfied. Pairs with a
    missing match attribute are kept (with a NaN score) only when
    allow_missing is True.

    Returns a pandas DataFrame with an '_id' column taken from the first
    candset column, the two key attributes, any requested output
    attributes, and optionally a '_sim_score' column.
    """
    # find column indices of key attr, match attr and output attrs in ltable
    l_columns = list(ltable.columns.values)
    l_key_attr_index = l_columns.index(l_key_attr)
    l_match_attr_index = l_columns.index(l_match_attr)
    l_out_attrs_indices = find_output_attribute_indices(l_columns, l_out_attrs)

    # find column indices of key attr, match attr and output attrs in rtable
    r_columns = list(rtable.columns.values)
    r_key_attr_index = r_columns.index(r_key_attr)
    r_match_attr_index = r_columns.index(r_match_attr)
    r_out_attrs_indices = find_output_attribute_indices(r_columns, r_out_attrs)

    # Build a dictionary on ltable keyed on l_key_attr so that each candset
    # pair can be resolved to a full ltable row in O(1). Null match-attribute
    # values are retained (remove_null=False) so allow_missing can see them.
    ltable_dict = build_dict_from_table(ltable,
                                        l_key_attr_index,
                                        l_match_attr_index,
                                        remove_null=False)

    # Build the corresponding dictionary on rtable.
    rtable_dict = build_dict_from_table(rtable,
                                        r_key_attr_index,
                                        r_match_attr_index,
                                        remove_null=False)

    # Find indices of l_key_attr and r_key_attr in candset.
    candset_columns = list(candset.columns.values)
    candset_l_key_attr_index = candset_columns.index(candset_l_key_attr)
    candset_r_key_attr_index = candset_columns.index(candset_r_key_attr)

    comp_fn = COMP_OP_MAP[comp_op]
    has_output_attributes = (l_out_attrs is not None
                             or r_out_attrs is not None)

    output_rows = []

    if show_progress:
        prog_bar = pyprind.ProgBar(len(candset))

    tokenize_flag = False
    if tokenizer is not None:
        tokenize_flag = True
        use_cache = False
        # Only use the token caches when both sides were pre-tokenized.
        if l_tokens is not None and r_tokens is not None:
            use_cache = True

    for candset_row in candset.itertuples(index=False):
        l_id = candset_row[candset_l_key_attr_index]
        r_id = candset_row[candset_r_key_attr_index]

        l_row = ltable_dict[l_id]
        r_row = rtable_dict[r_id]

        l_apply_col_value = l_row[l_match_attr_index]
        r_apply_col_value = r_row[r_match_attr_index]

        allow_pair = False
        # Check if one of the inputs is missing. If yes, check the
        # allow_missing flag. If it is True, then add the pair to output.
        # Else, continue. If none of the input is missing, then proceed to
        # apply the sim_function.
        if pd.isnull(l_apply_col_value) or pd.isnull(r_apply_col_value):
            if allow_missing:
                allow_pair = True
                # BUGFIX: pd.np was deprecated in pandas 0.25 and removed in
                # pandas 1.0; use a plain float NaN instead of pd.np.NaN.
                sim_score = float('nan')
            else:
                continue
        else:
            if tokenize_flag:
                # If we have cached the tokens, we use it directly. Else, we
                # tokenize the values.
                if use_cache:
                    l_apply_col_value = l_tokens[l_id]
                    r_apply_col_value = r_tokens[r_id]
                else:
                    l_apply_col_value = tokenizer.tokenize(l_apply_col_value)
                    r_apply_col_value = tokenizer.tokenize(r_apply_col_value)

            sim_score = sim_function(l_apply_col_value, r_apply_col_value)
            allow_pair = comp_fn(sim_score, threshold)

        if allow_pair:
            if has_output_attributes:
                output_row = get_output_row_from_tables(
                    l_row, r_row, l_key_attr_index, r_key_attr_index,
                    l_out_attrs_indices, r_out_attrs_indices)
                # carry over the candset row id as the first output column
                output_row.insert(0, candset_row[0])
            else:
                output_row = [candset_row[0], l_id, r_id]
            if out_sim_score:
                output_row.append(sim_score)
            output_rows.append(output_row)

        if show_progress:
            prog_bar.update()

    output_header = get_output_header_from_tables(l_key_attr, r_key_attr,
                                                  l_out_attrs, r_out_attrs,
                                                  l_out_prefix, r_out_prefix)
    output_header.insert(0, '_id')
    if out_sim_score:
        output_header.append("_sim_score")

    # generate a dataframe from the list of output rows
    output_table = pd.DataFrame(output_rows, columns=output_header)
    return output_table
Example #17
0
def _filter_tables_split(ltable, rtable,
                         l_columns, r_columns,
                         l_key_attr, r_key_attr,
                         l_filter_attr, r_filter_attr,
                         position_filter,
                         l_out_attrs, r_out_attrs,
                         l_out_prefix, r_out_prefix, show_progress):
    """Apply the position filter over one split of ltable x rtable.

    Builds a position index on l_filter_attr of ltable, then, for each
    rtable row, finds candidate ltable rows whose token overlap (per the
    filter's tokenizer, similarity measure and threshold) is positive and
    emits one output row per surviving pair. When the filter's allow_empty
    flag applies, rtable rows with an empty token set are paired with the
    ltable rows that also have an empty token set.

    Returns a pandas DataFrame of the surviving pairs with the key
    attributes and any requested output attributes.
    """
    # find column indices of key attr, filter attr and output attrs in ltable
    l_key_attr_index = l_columns.index(l_key_attr)
    l_filter_attr_index = l_columns.index(l_filter_attr)
    # (removed a dead `l_out_attrs_indices = []` that was immediately
    # overwritten by the call below)
    l_out_attrs_indices = find_output_attribute_indices(l_columns, l_out_attrs)

    # find column indices of key attr, filter attr and output attrs in rtable
    r_key_attr_index = r_columns.index(r_key_attr)
    r_filter_attr_index = r_columns.index(r_filter_attr)
    r_out_attrs_indices = find_output_attribute_indices(r_columns, r_out_attrs)

    # generate token ordering using tokens in l_filter_attr and r_filter_attr
    token_ordering = gen_token_ordering_for_tables(
                                 [ltable, rtable],
                                 [l_filter_attr_index, r_filter_attr_index],
                                 position_filter.tokenizer,
                                 position_filter.sim_measure_type)

    # ignore allow_empty flag for OVERLAP and EDIT_DISTANCE measures.
    handle_empty = (position_filter.allow_empty and
        position_filter.sim_measure_type not in ['OVERLAP', 'EDIT_DISTANCE'])

    # Build position index on l_filter_attr
    position_index = PositionIndex(ltable, l_filter_attr_index,
                                   position_filter.tokenizer,
                                   position_filter.sim_measure_type,
                                   position_filter.threshold, token_ordering)
    # While building the index, we cache the record ids with empty set of
    # tokens. This is needed to handle the allow_empty flag.
    cached_data = position_index.build(handle_empty)
    l_empty_records = cached_data['empty_records']

    output_rows = []
    has_output_attributes = (l_out_attrs is not None or
                             r_out_attrs is not None)

    if show_progress:
        prog_bar = pyprind.ProgBar(len(rtable))

    for r_row in rtable:
        r_string = r_row[r_filter_attr_index]

        # tokenize and order the rtable filter attribute once per row
        r_filter_attr_tokens = position_filter.tokenizer.tokenize(r_string)
        r_ordered_tokens = order_using_token_ordering(r_filter_attr_tokens,
                                                      token_ordering)

        # If allow_empty flag is set and the current rtable record has empty
        # set of tokens in the filter attribute, then generate output pairs
        # joining the current rtable record with those records in ltable with
        # empty set of tokens in the filter attribute. These ltable record
        # ids are cached in l_empty_records list which was constructed when
        # building the position index.
        if handle_empty and len(r_ordered_tokens) == 0:
            for l_id in l_empty_records:
                if has_output_attributes:
                    output_row = get_output_row_from_tables(
                                     ltable[l_id], r_row,
                                     l_key_attr_index, r_key_attr_index,
                                     l_out_attrs_indices,
                                     r_out_attrs_indices)
                else:
                    output_row = [ltable[l_id][l_key_attr_index],
                                  r_row[r_key_attr_index]]

                output_rows.append(output_row)
            continue

        # obtain candidates (and their overlap counts) from the position index
        candidate_overlap = position_filter.find_candidates(
                                r_ordered_tokens, position_index)

        for cand, overlap in iteritems(candidate_overlap):
            if overlap > 0:
                if has_output_attributes:
                    output_row = get_output_row_from_tables(
                                     ltable[cand], r_row,
                                     l_key_attr_index, r_key_attr_index,
                                     l_out_attrs_indices, r_out_attrs_indices)
                else:
                    output_row = [ltable[cand][l_key_attr_index],
                                  r_row[r_key_attr_index]]

                output_rows.append(output_row)

        if show_progress:
            prog_bar.update()

    output_header = get_output_header_from_tables(l_key_attr, r_key_attr,
                                                  l_out_attrs, r_out_attrs,
                                                  l_out_prefix, r_out_prefix)

    # generate a dataframe from the list of output rows
    output_table = pd.DataFrame(output_rows, columns=output_header)
    return output_table