コード例 #1
0
 def setUp(self):
     self.A = em.read_csv_metadata(path_for_A)
     em.set_key(self.A, 'ID')
     self.B = em.read_csv_metadata(path_for_B)
     em.set_key(self.B, 'ID')
     self.C = em.read_csv_metadata(path_for_C, ltable=self.A, rtable=self.B)
     self.feature_table = em.get_features_for_matching(self.A, self.B, validate_inferred_attr_types=False)
     self.brm = em.BooleanRuleMatcher()
コード例 #2
0
 def setUp(self):
     self.A = em.read_csv_metadata(path_for_A)
     em.set_key(self.A, 'ID')
     self.B = em.read_csv_metadata(path_for_B)
     em.set_key(self.B, 'ID')
     self.C = em.read_csv_metadata(path_for_C, ltable=self.A, rtable=self.B)
     self.feature_table = em.get_features_for_matching(self.A, self.B, validate_inferred_attr_types=False)
     self.C['neg_trig_labels'] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
     self.C['pos_trig_labels'] = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
     self.mt = em.MatchTrigger()
コード例 #3
0
def extract_features_auto(ltable_df, rtable_df, candset_df):
    feature_list = em.get_features_for_matching(ltable_df,rtable_df,validate_inferred_attr_types=False)
    #Remove all features based on id - they are often useless
    feature_list = feature_list[feature_list.left_attribute !='id']

    print("\n\nExtracting the full set of features:")
    candset_features_df = em.extract_feature_vecs(candset_df,feature_table=feature_list,attrs_after='gold',show_progress=True)
    candset_features_df.fillna(value=0, inplace=True)

    return candset_features_df
コード例 #4
0
ファイル: pipeline.py プロジェクト: is4ac/cs839-data-science
def main():
    A = em.read_csv_metadata('../Data/A_imdb.csv', key='id')
    B = em.read_csv_metadata('../Data/B_tmdb.csv', key='id')
    ab = em.AttrEquivalenceBlocker()
    shared_attributes = ['title', 'directors', 'release_year', 'languages']
    C = ab.block_tables(A,
                        B,
                        'directors',
                        'directors',
                        l_output_attrs=shared_attributes,
                        r_output_attrs=shared_attributes)
    # Take a sample of 10 pairs
    S = em.sample_table(C, 100)
    print(S)
    G = em.label_table(S, label_column_name='gold_labels')
    train_test = em.split_train_test(G, train_proportion=0.5)
    train, test = train_test['train'], train_test['test']
    # Get feature for matching
    match_f = em.get_features_for_matching(A, B)
    H = em.extract_feature_vecs(train,
                                attrs_before=['ltable_title', 'rtable_title'],
                                feature_table=match_f,
                                attrs_after=['gold_labels'])
    H.fillna(value=0, inplace=True)
    print(H)
    # Specifying Matchers and Performing Matching.
    dt = em.DTMatcher(max_depth=5)  # A decision tree matcher.
    # Train the matcher
    dt.fit(table=H,
           exclude_attrs=[
               '_id', 'ltable_id', 'rtable_id', 'ltable_title', 'rtable_title',
               'gold_labels'
           ],
           target_attr='gold_labels')
    # Predict
    F = em.extract_feature_vecs(test,
                                attrs_before=['ltable_title', 'rtable_title'],
                                feature_table=match_f,
                                attrs_after=['gold_labels'])
    F.fillna(value=0, inplace=True)
    print(F)
    pred_table = dt.predict(table=F,
                            exclude_attrs=[
                                '_id', 'ltable_id', 'rtable_id',
                                'ltable_title', 'rtable_title', 'gold_labels'
                            ],
                            target_attr='predicted_labels',
                            return_probs=True,
                            probs_attr='proba',
                            append=True,
                            inplace=True)
    print(pred_table)
    eval_summary = em.eval_matches(pred_table, 'gold_labels',
                                   'predicted_labels')
    em.print_eval_summary(eval_summary)
コード例 #5
0
 def setUp(self):
     self.A = em.read_csv_metadata(path_for_A)
     em.set_key(self.A, 'ID')
     self.B = em.read_csv_metadata(path_for_B)
     em.set_key(self.B, 'ID')
     self.C = em.read_csv_metadata(path_for_C, ltable=self.A, rtable=self.B)
     self.feature_table = em.get_features_for_matching(
         self.A, self.B, validate_inferred_attr_types=False)
     self.C['neg_trig_labels'] = [
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
     ]
     self.C['pos_trig_labels'] = [
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
     ]
     self.mt = em.MatchTrigger()
コード例 #6
0
def predict_matching_tuples(A, B, C, G):
    # Split G into I and J for CV
    IJ = em.split_train_test(G, train_proportion=0.5, random_state=0)
    I = IJ['train']
    # Generate features set F
    F = em.get_features_for_matching(A, B, validate_inferred_attr_types=False)
    # Convert G to a set of feature vectors using F
    H = em.extract_feature_vecs(I,
                                feature_table=F,
                                attrs_after='label',
                                show_progress=False)
    excluded_attributes = ['_id', 'l_id', 'r_id', 'label']
    # Fill in missing values with column's average
    H = em.impute_table(H, exclude_attrs=excluded_attributes, strategy='mean')
    # Create and train a logistic regression - the best matcher from stage3.
    lg = em.LogRegMatcher(name='LogReg', random_state=0)
    lg.fit(table=H, exclude_attrs=excluded_attributes, target_attr='label')
    # Convert C into a set of features using F
    L = em.extract_feature_vecs(C, feature_table=F, show_progress=False)
    # Fill in missing values with column's average
    L = em.impute_table(L,
                        exclude_attrs=['_id', 'l_id', 'r_id'],
                        strategy='mean')
    # Predict on L with trained matcher
    predictions = lg.predict(table=L,
                             exclude_attrs=['_id', 'l_id', 'r_id'],
                             append=True,
                             target_attr='predicted',
                             inplace=False,
                             return_probs=False,
                             probs_attr='proba')
    # Extract the matched pairs' ids
    matched_pairs = predictions[predictions.predicted == 1]
    matched_ids = matched_pairs[['l_id', 'r_id']]
    # Save matched_pairs to file so we don't have to train and predict each time the code is executed
    matched_ids.to_csv(FOLDER + 'predictedMatchedIDs.csv', index=False)
コード例 #7
0
import logging
logging.basicConfig(level=logging.INFO)
print("Mem. usage before reading:{0} (GB)".format(
    psutil.virtual_memory().used / 1e9))
A = em.read_csv_metadata('../datasets/sample_msd_100k.csv', key='id')
B = em.read_csv_metadata('../datasets/sample_msd_100k.csv', key='id')
C = em.read_csv_metadata('../datasets/candset_msd_300k.csv',
                         key='_id',
                         ltable=A,
                         rtable=B,
                         fk_ltable='l_id',
                         fk_rtable='r_id')
print("Mem. usage after reading:{0} (GB)".format(psutil.virtual_memory().used /
                                                 1e9))
len(C)

memUsageBefore = psutil.virtual_memory().used / 1e9
timeBefore = time.time()
feature_table = em.get_features_for_matching(A, B)

memUsageBefore = psutil.virtual_memory().used / 1e9
timeBefore = time.time()
feature_vecs = em.extract_feature_vecs(C, feature_table=feature_table)
timeAfter = time.time()
memUsageAfter = psutil.virtual_memory().used / 1e9

print(
    'Mem.usage (after reading): {0} (GB), Mem.usage (after extract featvecs): {1} (GB), diff: {2} (GB)'
    .format(memUsageBefore, memUsageAfter, memUsageAfter - memUsageBefore))
print('Time. diff: {0} (secs)'.format(timeAfter - timeBefore))
コード例 #8
0
ファイル: match.py プロジェクト: wkfunk/838_project
em.set_key(tracks, 'id')

labeled_candidates = em.read_csv_metadata('data/golden_labeled_all_missing_removed.csv', key = '_id',ltable = songs, rtable = tracks, fk_ltable='ltable_id', fk_rtable='rtable_id')

IJ = em.split_train_test(labeled_candidates, train_proportion=0.7, random_state=0)
I = IJ['train']
J = IJ['test']

dt = em.DTMatcher(name='DecisionTree', random_state=0)
svm = em.SVMMatcher(name='SVM', random_state=0)
rf = em.RFMatcher(name='RF', random_state=0)
lg = em.LogRegMatcher(name='LogReg', random_state=0)
ln = em.LinRegMatcher(name='LinReg')
nb = em.NBMatcher(name='NaiveBayes')

feature_table = em.get_features_for_matching(songs, tracks)

H = em.extract_feature_vecs(I, 
                            feature_table=feature_table, 
                            attrs_after='gold',
                            show_progress=False)
							
result = em.select_matcher([dt, rf, svm, ln, lg, nb], table=H, 
        exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'gold'],
        k=5,
        target_attr='gold', metric='precision', random_state=0)
		
L = em.extract_feature_vecs(J, feature_table=feature_table,
                            attrs_after='gold', show_progress=False)

lg.fit(table=H, 
コード例 #9
0
def automatic_feature_gen(candidate_table, feature_cols, id_names,
                          id_names_phrase):
    '''
    NB!
    The automatic function creates pairwise features. Consequently, it will convert
    internally the colnames in lhs and rhs portions of feature cols to the SAME name.
    It does this by trimming the `id_names_phrase` portion (suffix or prefix) from each column name
    It assumes that the id names are of the form id_{id_names_phrase} e.g. id_amzn

    Replaces Nans in candidate table with empty strings

    Takes in a single DataFrame object (lhs_table and rhs_table concatenated) and
    splits it into two tables then generates features on each of the sub tables.

    Inputs:
            candidate_table: single Pandas DataFrame (typically output of blocking_algorithms.py functions)

    Outputs:
    '''

    em.del_catalog()
    candidate_table = candidate_table.reset_index()

    lhs_table = candidate_table.loc[:, feature_cols[0] + [id_names[0]]]
    rhs_table = candidate_table.loc[:, feature_cols[1] + [id_names[1]]]

    lhs_colnames = []
    for colname in lhs_table:
        if colname != id_names[0]:
            lhs_colnames.append(re.sub(id_names_phrase[0], "", colname))
        else:
            lhs_colnames.append(colname)
    rhs_colnames = []
    for colname in rhs_table:
        if colname != id_names[1]:
            rhs_colnames.append(re.sub(id_names_phrase[1], "", colname))
        else:
            rhs_colnames.append(colname)

    lhs_table.columns = lhs_colnames
    rhs_table.columns = rhs_colnames
    # To circumvent the same product ID coming up again (due to it being in multiple candidate comparisons)
    lhs_table["index_num_lhs"] = np.arange(lhs_table.shape[0])
    rhs_table["index_num_rhs"] = np.arange(rhs_table.shape[0])

    em.set_key(lhs_table, "index_num_lhs")  # changed from id_names
    em.set_key(rhs_table, "index_num_rhs")
    # Generate List Of Features
    matching_features = em.get_features_for_matching(
        lhs_table.drop(id_names[0], axis=1),
        rhs_table.drop(id_names[1], axis=1),
        validate_inferred_attr_types=False)
    # Extract feature vectors and save as a  DF
    # Set primary keys and foreign keys for candidate table
    candidate_table["index"] = np.arange(candidate_table.shape[0])
    # Add foreign keys to candidate table
    candidate_table["index_num_lhs"] = np.arange(lhs_table.shape[0])
    candidate_table["index_num_rhs"] = np.arange(rhs_table.shape[0])

    em.set_key(candidate_table, "index")
    em.set_fk_ltable(candidate_table, "index_num_lhs")
    em.set_fk_rtable(candidate_table, "index_num_rhs")
    em.set_ltable(candidate_table, lhs_table)
    em.set_rtable(candidate_table, rhs_table)

    matching_features_df = em.extract_feature_vecs(
        candidate_table, feature_table=matching_features, show_progress=False)

    matching_features_df = em.impute_table(
        matching_features_df,
        exclude_attrs=['index', "index_num_lhs", "index_num_rhs"],
        strategy='mean')
    # add back the amzn and google ids
    matching_features_df["id_amzn"] = candidate_table.id_amzn
    matching_features_df["id_g"] = candidate_table.id_g

    matching_features_df = matching_features_df.fillna(value=0)

    # print(matching_features_df.describe())
    # print(f"Number na {matching_features_df.isna().apply(sum)}")
    # print(f"Number null {matching_features_df.isnull().apply(sum)}")
    return matching_features_df
コード例 #10
0
def main():
    # WELCOME TO MY MAGELLAN RUN SCRIPT
    print("\n-------------WELCOME TO MY MAGELLAN RUN SCRIPT-------------\n")

    # Get the datasets directory
    datasets_dir = 'B:\McMaster\CAS 764 - Advance Topics in Data Management\Project\Data\\'
    print("- Dataset directory: " + datasets_dir)
    print("- List of folders/files: ")
    print(os.listdir(datasets_dir))
    print("- Please enter new dataset folder name:")
    datasets_dir += input()
    print("- Dataset directory set to: " + datasets_dir)

    dateset_dir_files = os.listdir(datasets_dir)
    print("- List of files in dataset folder: ")
    print(dateset_dir_files)

    # Get the path of the input table A
    print("- Enter an index for Table A file (0-x):")
    file_index_A = input()
    filename_A = dateset_dir_files[int(file_index_A)]
    print("Table A file set to: " + filename_A)

    # Get the path of the input table
    path_A = datasets_dir + os.sep + filename_A

    # Get the path of the input table B
    print("- Enter an index for Table B file (0-x):")
    file_index_B = input()
    filename_B = dateset_dir_files[int(file_index_B)]
    print("Table B file set to: " + filename_B)

    # Get the path of the input table
    path_B = datasets_dir + os.sep + filename_B

    # Print Table A column names
    A = em.read_csv_metadata(path_A)
    print("- List of columns of Table A: ")
    print(list(A.columns))
    # Get the Table A id/primary key column name
    print('- Enter Table A primary key column index (ex. 0):')
    pk_A_index = input()
    pk_A = A.columns[int(pk_A_index)]

    # Print Table B column names
    B = em.read_csv_metadata(path_B)
    print("- List of columns of Table B: ")
    print(list(B.columns))
    # Get the Table B id/primary key column name
    print('- Enter Table B primary key column index (ex. 0):')
    pk_B_index = input()
    pk_B = A.columns[int(pk_A_index)]

    # READING TABLES AND SETTING METADATA
    print("\n-------------READING TABLES AND SETTING METADATA-------------\n")

    # Both read csv and set metadata id as ID column
    #A = em.read_csv_metadata(path_A, key=pk_A)
    #B = em.read_csv_metadata(path_B, key=pk_B)
    em.set_key(A, pk_A)
    em.set_key(B, pk_B)

    # Number of tables
    print('- Number of tuples in A: ' + str(len(A)))
    print('- Number of tuples in B: ' + str(len(B)))
    print('- Number of tuples in A X B (i.e the cartesian product): ' +
          str(len(A) * len(B)))

    # Print first 5 tuples of tables
    print(A.head())
    print(B.head())

    # Display the keys of the input tables
    print("- Table A primary key: " + em.get_key(A))
    print("- Table B primary key: " + em.get_key(B))

    # DOWNSAMPLING
    print("\n-------------DOWNSAMPING-------------\n")

    print("- Do you want to use downsampling? (y or n):")
    print("- Table A: " + str(len(A)) + ", Table B: " + str(len(B)))
    print("- NOTE: Recommended if both tables have 100K+ tuples.")
    is_downsample = input()
    if (is_downsample == 'y'):
        print("- Size of the downsampled tables (ex. 200):")
        downsample_size = input()
        # If the tables are large we can downsample the tables like this
        A1, B1 = em.down_sample(A, B, downsample_size, 1, show_progress=False)
        print("- Length of Table A1" + len(A1))
        print("- Length of Table B1" + len(B1))

    # BLOCKING
    print("\n-------------BLOCKING-------------\n")

    print("- Do you want to use blocking? (y or n):")
    is_blocking = input()
    if (is_blocking == 'y'):

        # Check if the 2 tables column names are the same
        if (list(A.columns) == list(B.columns)):
            C_attr_eq = []  # Attr Equ blocker result list
            C_overlap = []  # Overlap blocker result list
            C_blackbox = []  # BlackBox blocker result list

            # Left and right table attribute prefixes
            l_prefix = "ltable_"
            r_prefix = "rtable_"

            print("\n- List of columns: ")
            print(list(A.columns))
            # Labeling output table column selection
            print(
                "\n- Enter the indexes of columns that you want to see in labeling table (0-"
                + str(len(A.columns) - 1) + "):")
            out_attr = []
            for i in range(1, len(A.columns)):
                print("- Finish with empty character(enter+enter) " + str(i))
                add_to_attr = input()
                if (add_to_attr == ''):
                    break
                # Get indexes from user and add columns into out_attr list
                out_attr.append(A.columns[int(add_to_attr)])

            # Print output attributes
            print(out_attr)

            # Loop for adding/combining new blockers
            while (True):
                # Blocker selection
                print(
                    "\n- Do yo want to use Attribute Equivalence[ab] (same), Overlap[ob] (similar) or Blackbox[bb] blocker:"
                )
                blocker_selection = input()

                # ----- Attribute Equivalence Blocker -----
                if (blocker_selection == 'ab'):
                    # Create attribute equivalence blocker
                    ab = em.AttrEquivalenceBlocker()
                    # Counter for indexes
                    attr_eq_counter = 0
                    # Check if Overlap Blocker used before
                    if (C_overlap and not C_overlap[-1].empty):
                        print(
                            "\n- Do you want to work on Overlap Blocker candidate set or not (y or n):"
                        )
                        use_cand_set = input()
                        if (use_cand_set == 'y'):
                            C_attr_eq.append(
                                C_overlap[-1])  # Add last output of ob
                            attr_eq_counter += 1  # For skipping block_table function in first time

                    # Check if BlackBox Blocker used before
                    if (C_blackbox and not C_blackbox[-1].empty):
                        print(
                            "\n- Do you want to work on BlackBox Blocker candidate set or not (y or n):"
                        )
                        use_cand_set = input()
                        if (use_cand_set == 'y'):
                            C_attr_eq.append(
                                C_blackbox[-1])  # Add last output of ob
                            attr_eq_counter += 1  # For skipping block_table function in first time

                    # Loop for adding more columns/attributes into Attr Equ blocker
                    while (True):
                        # List column names
                        print("\n- List of columns: ")
                        print(list(A.columns))
                        # Get blocking attribute/column
                        print(
                            "\n- Which column (w/ index) to use for equivalence blocking? (ex. 1):"
                        )
                        blocking_col_index = input()
                        blocking_col = A.columns[int(blocking_col_index)]

                        print(
                            "\n- Do you want to add missing values into blocking? (y or n):"
                        )
                        add_missing_val = input()
                        if (add_missing_val == 'y'):
                            add_missing_val = True
                        else:
                            add_missing_val = False

                        # First time using Attr Equ blocker, use A and B
                        if (attr_eq_counter == 0):
                            # Block using selected (blocking_col) attribute on A and B
                            C_attr_eq.append(
                                ab.block_tables(A,
                                                B,
                                                blocking_col,
                                                blocking_col,
                                                l_output_attrs=out_attr,
                                                r_output_attrs=out_attr,
                                                l_output_prefix=l_prefix,
                                                r_output_prefix=r_prefix,
                                                allow_missing=add_missing_val,
                                                n_jobs=-1))
                        # Not first time, add new constraint into previous candidate set
                        else:
                            # Block using selected (blocking_col) attribute on previous (last=-1) candidate set
                            C_attr_eq.append(
                                ab.block_candset(C_attr_eq[-1],
                                                 l_block_attr=blocking_col,
                                                 r_block_attr=blocking_col,
                                                 allow_missing=add_missing_val,
                                                 n_jobs=-1,
                                                 show_progress=False))

                        # DEBUG BLOCKING
                        print(
                            "\n- Attribute Equivalence Blocker Debugging...\n")
                        # Debug last blocker output
                        dbg = em.debug_blocker(C_attr_eq[-1],
                                               A,
                                               B,
                                               output_size=200,
                                               n_jobs=-1)

                        # Display first few tuple pairs from the debug_blocker's output
                        print("\n- Blocking debug results:")
                        print(dbg.head())

                        attr_eq_counter += 1  # Increase the counter

                        # Continue to use Attribute Equivalence Blocker or not
                        print("\n- Length of candidate set: " +
                              str(len(C_attr_eq[-1])))
                        print(
                            "- Add another column into Attribute Equivalence Blocker[a] OR Reset last blocker's output[r]:"
                        )
                        ab_next_operation = input()
                        if (not ab_next_operation.islower()):
                            ab_next_operation = ab_next_operation.lower(
                            )  # Lower case
                        # Continue using Attribute Equivalence Blocker
                        if (ab_next_operation == 'a'):
                            continue
                        # Reset/remove last blocker's output from candidate set list
                        elif (ab_next_operation == 'r'):
                            del C_attr_eq[-1]
                            print("\n- Last blocker output removed!")
                            print(
                                "- Continue to use Attribute Equivalence Blocker (y or n):"
                            )
                            ab_next_operation = input()
                            if (ab_next_operation == 'n'):
                                break
                        # Finish Attribute Equivalence Blocker
                        else:
                            break

                # ----- Overlap Blocker -----
                elif (blocker_selection == 'ob'):
                    # Create attribute equivalence blocker
                    ob = em.OverlapBlocker()
                    # Counter for indexes
                    overlap_counter = 0
                    # Check if Attribute Equivalence Blocker used before
                    if (C_attr_eq and not C_attr_eq[-1].empty):
                        print(
                            "\n- Do you want to work on Attribute Equivalence Blocker candidate set or not (y or n):"
                        )
                        use_cand_set = input()
                        if (use_cand_set == 'y'):
                            C_overlap.append(
                                C_attr_eq[-1])  # Add last output of ab
                            overlap_counter += 1  # For skipping block_table function in first time

                    # Check if BlackBox Blocker used before
                    if (C_blackbox and not C_blackbox[-1].empty):
                        print(
                            "\n- Do you want to work on BlackBox Blocker candidate set or not (y or n):"
                        )
                        use_cand_set = input()
                        if (use_cand_set == 'y'):
                            C_overlap.append(
                                C_blackbox[-1])  # Add last output of ob
                            overlap_counter += 1  # For skipping block_table function in first time

                    # Loop for adding more columns/attributes into Overlap blocker
                    while (True):
                        # List column names
                        print("- List of columns: ")
                        print(list(A.columns))
                        # Get blocking attribute/column
                        print(
                            "- Which column (w/ index) to use for overlap blocking? (ex. 1):"
                        )
                        blocking_col_index = input()
                        blocking_col = A.columns[int(blocking_col_index)]

                        print(
                            "\n- Do you want to add missing values into blocking? (y or n):"
                        )
                        add_missing_val = input()
                        if (add_missing_val == 'y'):
                            add_missing_val = True
                        else:
                            add_missing_val = False

                        print("\n- Use words as a token? (y or n):")
                        use_world_level = input()
                        if (use_world_level == 'y'):
                            use_world_level = True
                            q_gram_value = None
                        else:
                            use_world_level = False
                            print(
                                "\n- Q-gram q value (ex. 2 --> JO HN SM IT H):"
                            )
                            q_gram_value = input()
                            q_gram_value = int(q_gram_value)

                        print(
                            "\n- Enter the overlap size (# of tokens that overlap):"
                        )
                        overlap_size = input()
                        overlap_size = int(overlap_size)

                        print(
                            "\n- Do you want to remove (a, an, the) from token set? (y or n):"
                        )
                        use_stop_words = input()
                        if (use_stop_words == 'y'):
                            use_stop_words = True
                        else:
                            use_stop_words = False

                        # First time using Overlap blocker, use A and B
                        if (overlap_counter == 0):
                            # Block using selected (blocking_col) attribute on A and B
                            C_overlap.append(
                                ob.block_tables(A,
                                                B,
                                                blocking_col,
                                                blocking_col,
                                                l_output_attrs=out_attr,
                                                r_output_attrs=out_attr,
                                                l_output_prefix=l_prefix,
                                                r_output_prefix=r_prefix,
                                                rem_stop_words=use_stop_words,
                                                q_val=q_gram_value,
                                                word_level=use_world_level,
                                                overlap_size=overlap_size,
                                                allow_missing=add_missing_val,
                                                n_jobs=-1))
                        # Not first time, add new constraint into previous candidate set
                        else:
                            # Block using selected (blocking_col) attribute on previous (last=-1) candidate set
                            C_overlap.append(
                                ob.block_candset(C_overlap[-1],
                                                 l_overlap_attr=blocking_col,
                                                 r_overlap_attr=blocking_col,
                                                 rem_stop_words=use_stop_words,
                                                 q_val=q_gram_value,
                                                 word_level=use_world_level,
                                                 overlap_size=overlap_size,
                                                 allow_missing=add_missing_val,
                                                 n_jobs=-1,
                                                 show_progress=False))

                        # DEBUG BLOCKING
                        print("\n- Overlap Blocker Debugging...\n")
                        # Debug last blocker output
                        dbg = em.debug_blocker(C_overlap[-1],
                                               A,
                                               B,
                                               output_size=200,
                                               n_jobs=-1)

                        # Display first few tuple pairs from the debug_blocker's output
                        print("\n- Blocking debug results:")
                        print(dbg.head())

                        overlap_counter += 1  # Increase the counter

                        # Continue to use Attribute Equivalence Blocker or not
                        print("\n- Length of candidate set: " +
                              str(len(C_overlap[-1])))
                        print(
                            "- Add another column into Overlap Blocker[a] OR Reset last blocker's output[r]:"
                        )
                        ob_next_operation = input()
                        if (not ob_next_operation.islower()):
                            ob_next_operation = ob_next_operation.lower(
                            )  # Lower case
                        # Continue using Overlap Blocker
                        if (ob_next_operation == 'a'):
                            continue
                        # Reset/remove last blocker's output from candidate set list
                        elif (ob_next_operation == 'r'):
                            del C_overlap[-1]
                            print("\n- Last blocker output removed!")
                            print(
                                "- Continue to use Overlap Blocker (y or n):")
                            ob_next_operation = input()
                            if (ob_next_operation == 'n'):
                                break
                        # Finish Overlap Blocker
                        else:
                            break

                # ----- BlackBox Blocker -----
                elif (blocker_selection == 'bb'):
                    # Create attribute equivalence blocker
                    bb = em.BlackBoxBlocker()
                    # Counter for indexes
                    blackbox_counter = 0
                    # Check if Overlap Blocker used before
                    if (C_attr_eq and not C_attr_eq[-1].empty):
                        print(
                            "\n- Do you want to work on Attribute Equivalence Blocker candidate set or not (y or n):"
                        )
                        use_cand_set = input()
                        if (use_cand_set == 'y'):
                            C_blackbox.append(
                                C_attr_eq[-1])  # Add last output of ob
                            blackbox_counter += 1  # For skipping block_table function in first time

                    # Check if Overlap Blocker used before
                    if (C_overlap and not C_overlap[-1].empty):
                        print(
                            "\n- Do you want to work on Overlap Blocker candidate set or not (y or n):"
                        )
                        use_cand_set = input()
                        if (use_cand_set == 'y'):
                            C_blackbox.append(
                                C_overlap[-1])  # Add last output of ob
                            blackbox_counter += 1  # For skipping block_table function in first time

                    # Loop for adding more columns/attributes into BlackBox blocker
                    while (True):
                        # Set function
                        bb.set_black_box_function(
                            number_10_percent_comparision)

                        # First time using Overlap blocker, use A and B
                        if (overlap_counter == 0):
                            # Block on A and B
                            C_blackbox.append(
                                bb.block_tables(A,
                                                B,
                                                l_output_attrs=out_attr,
                                                r_output_attrs=out_attr,
                                                l_output_prefix=l_prefix,
                                                r_output_prefix=r_prefix,
                                                n_jobs=-1,
                                                show_progress=False))
                        # Not first time, add new constraint into previous candidate set
                        else:
                            # Block on previous (last=-1) candidate set
                            C_blackbox.append(
                                bb.block_candset(C_blackbox[-1],
                                                 n_jobs=-1,
                                                 show_progress=False))

                        # DEBUG BLOCKING
                        print("\n- BlackBox Blocker Debugging...\n")
                        # Debug last blocker output
                        dbg = em.debug_blocker(C_blackbox[-1],
                                               A,
                                               B,
                                               output_size=200,
                                               n_jobs=-1)

                        # Display first few tuple pairs from the debug_blocker's output
                        print("\n- Blocking debug results:")
                        print(dbg.head())

                        blackbox_counter += 1  # Increase the counter

                        # Continue to use Attribute Equivalence Blocker or not
                        print("\n- Length of candidate set: " +
                              str(len(C_blackbox[-1])))
                        print(
                            "- Add another column into BlackBox Blocker[a] OR Reset last blocker's output[r]:"
                        )
                        bb_next_operation = input()
                        if (not bb_next_operation.islower()):
                            bb_next_operation = bb_next_operation.lower(
                            )  # Lower case
                        # Continue using Overlap Blocker
                        if (bb_next_operation == 'a'):
                            continue
                        # Reset/remove last blocker's output from candidate set list
                        elif (bb_next_operation == 'r'):
                            del C_blackbox[-1]
                            print("\n- Last blocker output removed!")
                            print(
                                "- Continue to use BlackBox Blocker (y or n):")
                            bb_next_operation = input()
                            if (bb_next_operation == 'n'):
                                break
                        # Finish BlackBox Blocker
                        else:
                            break

                print("\n- Do you want to add/use another blocker? (y or n):")
                blocker_decision = input()
                if (blocker_decision == 'n'):
                    break

            print(
                "\n- Which blocker output you want to use? (Attr Equ-ab, Overlap-ob, BlackBox-bb, Union-un)"
            )
            blocker_output_selection = input()
            # Attribute Equ
            if (blocker_output_selection == "ab"):
                C = C_attr_eq[-1]
            # Overlap
            elif (blocker_output_selection == "ob"):
                C = C_overlap[-1]
                # Overlap
            elif (blocker_output_selection == "bb"):
                C = C_blackbox[-1]
            # Union of blockers
            elif (blocker_output_selection == "un"):
                # Combine/union blockers candidate sets
                print("\n- TODO: Unions Attr Equ and Overlap only!")
                if (C_attr_eq and C_overlap and not C_attr_eq[-1].empty and
                        not C_overlap[-1].empty):  # Both blocker types used
                    C = em.combine_blocker_outputs_via_union(
                        [C_attr_eq[-1], C_overlap[-1]])
                    print(
                        "\n- Blockers candidate set outputs combined via union."
                    )
                else:  # Error
                    C = []
                    print(
                        "\n- ERROR: Candidate set C is empty! Check blockers' results."
                    )
            # Error
            else:
                C = []
                print(
                    "\n- ERROR: Candidate set C is empty! Check blockers' results."
                )
            print("\n- Length of C: " + str(len(C)))

        else:
            print(
                "\n- 2 Tables column names are different, they must be the same"
            )
            print(list(A.columns))
            print(list(B.columns))

    # SAMPLING&LABELING
    print("\n-------------SAMPLING&LABELING-------------\n")

    print("- Choose sampling size (eg. 450):")
    sampling_size = input()
    while (int(sampling_size) > len(C)):
        print("- Sampling size cannot be bigger than " + str(len(C)))
        sampling_size = input()

    # Sample  candidate set
    S = em.sample_table(C, int(sampling_size))

    print("- New window will pop-up for " + sampling_size + " sized table.")
    print("- If there is a match, change tuple's label value to 1")

    # Label S
    G = em.label_table(S, 'label')

    #DEVELOPMENT AND EVALUATION
    print("\n-------------DEVELOPMENT AND EVALUATION-------------\n")

    # Split S into development set (I) and evaluation set (J)
    IJ = em.split_train_test(G, train_proportion=0.7, random_state=0)
    I = IJ['train']
    J = IJ['test']

    #SELECTING THE BEST MATCHER
    print("\n-------------SELECTING THE BEST MATCHER-------------\n")

    # Create a set of ML-matchers
    dt = em.DTMatcher(name='DecisionTree', random_state=0)
    svm = em.SVMMatcher(name='SVM', random_state=0)
    rf = em.RFMatcher(name='RF', random_state=0)
    lg = em.LogRegMatcher(name='LogReg', random_state=0)
    ln = em.LinRegMatcher(name='LinReg')
    nb = em.NBMatcher(name='NaiveBayes')

    print(
        "\n- 6 different ML-matchers created: DL, SVM, RF, LogReg, LinReg, NB")

    print("\n- Creating features...")
    # Generate features
    feature_table = em.get_features_for_matching(
        A, B, validate_inferred_attr_types=False)

    print("\n- Features list:")
    # List the names of the features generated
    print(feature_table['feature_name'])

    print("\n- Converting the development set to feature vectors...")
    # Convert the I into a set of feature vectors using feature_table
    H = em.extract_feature_vecs(I,
                                feature_table=feature_table,
                                attrs_after='label',
                                show_progress=False)

    print("\n- Feature table first rows:")
    # Display first few rows
    print(H.head())

    # Primary key of tables = prefix + pk = l_id, r_id
    ltable_pk = l_prefix + pk_A
    rtable_pk = r_prefix + pk_B

    # Check if the feature vectors contain missing values
    # A return value of True means that there are missing values
    is_missing_values = any(pd.notnull(H))
    print("\n- Does feature vector have missing values: " +
          str(is_missing_values))
    if (is_missing_values):
        # Impute feature vectors with the mean of the column values.
        H = em.impute_table(
            H,
            exclude_attrs=['_id', ltable_pk, rtable_pk, 'label'],
            strategy='mean',
            val_all_nans=0.0)
        #print("\n- Feature table first rows:")
        # Display first few rows
        #print(H.head())
        print("- Impute table function used for missing values.")

    print("\n- Selecting the best matcher using cross-validation...")
    # Select the best ML matcher using CV
    result = em.select_matcher(
        matchers=[dt, rf, svm, ln, lg, nb],
        table=H,
        exclude_attrs=['_id', ltable_pk, rtable_pk, 'label'],
        k=5,
        target_attr='label',
        metric_to_select_matcher='f1',
        random_state=0)
    print("\n- Results:")
    print(result['cv_stats'])

    #DEBUGGING THE MATCHER
    print("\n-------------DEBUGGING THE MATCHER-------------\n")

    #  Split feature vectors into train and test
    UV = em.split_train_test(H, train_proportion=0.5)
    U = UV['train']
    V = UV['test']

    # Debug decision tree using GUI
    em.vis_debug_rf(rf,
                    U,
                    V,
                    exclude_attrs=['_id', ltable_pk, rtable_pk, 'label'],
                    target_attr='label')

    print("\n- Do you want to add another feature?")

    H = em.extract_feature_vecs(I,
                                feature_table=feature_table,
                                attrs_after='label',
                                show_progress=False)

    # Check if the feature vectors contain missing values
    # A return value of True means that there are missing values
    is_missing_values = any(pd.notnull(H))
    print("\n- Does feature vector have missing values: " +
          str(is_missing_values))
    if (is_missing_values):
        # Impute feature vectors with the mean of the column values.
        H = em.impute_table(
            H,
            exclude_attrs=['_id', ltable_pk, rtable_pk, 'label'],
            strategy='mean')
        print("\n- Feature table first rows:")
        # Display first few rows
        print(H.head())

    # Select the best ML matcher using CV
    result = em.select_matcher(
        [dt, rf, svm, ln, lg, nb],
        table=H,
        exclude_attrs=['_id', ltable_pk, rtable_pk, 'label'],
        k=5,
        target_attr='label',
        metric_to_select_matcher='f1',
        random_state=0)

    print("\n- Results:")
    print(result['cv_stats'])

    #EVALUATING THE MATCHING OUTPUT
    print("\n-------------EVALUATING THE MATCHING OUTPUT-------------\n")

    print("\n- Converting the evaluation set to feature vectors...")
    # Convert J into a set of feature vectors using feature table
    L = em.extract_feature_vecs(J,
                                feature_table=feature_table,
                                attrs_after='label',
                                show_progress=False)

    # Check if the feature vectors contain missing values
    # A return value of True means that there are missing values
    is_missing_values = any(pd.notnull(L))
    print("\n- Does feature vector have missing values: " +
          str(is_missing_values))
    if (is_missing_values):
        # Impute feature vectors with the mean of the column values.
        L = em.impute_table(
            L,
            exclude_attrs=['_id', ltable_pk, rtable_pk, 'label'],
            strategy='mean')
        print("\n- Feature table first rows:")
        # Display first few rows
        print(L.head())

    print("\n- Training the selected matcher...")
    # Train using feature vectors from I
    rf.fit(table=H,
           exclude_attrs=['_id', ltable_pk, rtable_pk, 'label'],
           target_attr='label')

    print("\n- Predicting the matches...")
    # Predict on L
    predictions = rf.predict(
        table=L,
        exclude_attrs=['_id', ltable_pk, rtable_pk, 'label'],
        append=True,
        target_attr='predicted',
        inplace=False)

    print("\n- Evaluating the prediction...")
    # Evaluate the predictions
    eval_result = em.eval_matches(predictions, 'label', 'predicted')
    print(em.print_eval_summary(eval_result))

    print("\n- Time elapsed:")
    print(datetime.now() - startTime)

    print("\n-------------END-------------\n")
コード例 #11
0
ファイル: solution.py プロジェクト: dyyfk/cs639_spring19
def main():

    A = em.read_csv_metadata('ltable.csv',
                             key="ltable_id",
                             encoding='ISO-8859-1')
    B = em.read_csv_metadata('rtable.csv',
                             key="rtable_id",
                             encoding='ISO-8859-1')

    ob = em.OverlapBlocker()
    C = ob.block_tables(
        A,
        B,
        'title',
        'title',
        l_output_attrs=['title', 'category', 'brand', 'modelno', 'price'],
        r_output_attrs=['title', 'category', 'brand', 'modelno', 'price'],
        overlap_size=1,
        show_progress=False)
    S = em.sample_table(C, 450)

    G = em.read_csv_metadata("train.csv",
                             key='id',
                             ltable=A,
                             rtable=B,
                             fk_ltable='ltable_id',
                             fk_rtable='rtable_id')
    feature_table = em.get_features_for_matching(
        A, B, validate_inferred_attr_types=False)
    G = em.label_table(S, 'label')

    attrs_from_table = [
        'ltable_title', 'ltable_category', 'ltable_brand', 'ltable_modelno',
        'ltable_price', 'rtable_title', 'rtable_category', 'rtable_brand',
        'rtable_modelno', 'rtable_price'
    ]
    H = em.extract_feature_vecs(G,
                                feature_table=feature_table,
                                attrs_before=attrs_from_table,
                                attrs_after='label',
                                show_progress=False)
    H.fillna('0', inplace=True)
    #     H = em.impute_table(
    #         H, exclude_attrs=['_id', 'ltable_ltable_id', 'rtable_rtable_id','label'], strategy='mean')
    rf = em.RFMatcher()

    attrs_to_be_excluded = []
    attrs_to_be_excluded.extend(
        ['_id', 'ltable_ltable_id', 'rtable_rtable_id', 'label'])
    attrs_to_be_excluded.extend(attrs_from_table)

    rf.fit(table=H, exclude_attrs=attrs_to_be_excluded, target_attr='label')

    attrs_from_table = [
        'ltable_title', 'ltable_category', 'ltable_brand', 'ltable_modelno',
        'ltable_price', 'rtable_title', 'rtable_category', 'rtable_brand',
        'rtable_modelno', 'rtable_price'
    ]
    L = em.extract_feature_vecs(C,
                                feature_table=feature_table,
                                attrs_before=attrs_from_table,
                                show_progress=False,
                                n_jobs=-1)

    attrs_to_be_excluded = []
    attrs_to_be_excluded.extend(
        ['_id', 'ltable_ltable_id', 'rtable_rtable_id'])
    attrs_to_be_excluded.extend(attrs_from_table)

    predictions = rf.predict(table=L,
                             exclude_attrs=attrs_to_be_excluded,
                             append=True,
                             target_attr='predicted',
                             inplace=False)

    dataset = pd.DataFrame({"id": G[0]['id'], 'label': predictions['label']})
    dataset.to_csv("./prediction2.csv", index=False)
コード例 #12
0
ファイル: merge.py プロジェクト: is4ac/cs839-data-science
def main():
    # Read in data files
    A = em.read_csv_metadata(FOLDER + 'A.csv', key='id')  # imdb data
    B = em.read_csv_metadata(FOLDER + 'B.csv', key='id')  # tmdb data
    G = em.read_csv_metadata(FOLDER + 'G.csv',
                             key='_id',
                             ltable=A,
                             rtable=B,
                             fk_ltable='l_id',
                             fk_rtable='r_id')  # labeled data
    # Split G into I and J for CV
    IJ = em.split_train_test(G, train_proportion=0.5, random_state=0)
    I = IJ['train']
    # Generate features set F
    F = em.get_features_for_matching(A, B, validate_inferred_attr_types=False)
    # Convert I to a set of feature vectors using F
    H = em.extract_feature_vecs(I,
                                feature_table=F,
                                attrs_after='label',
                                show_progress=False)
    excluded_attributes = ['_id', 'l_id', 'r_id', 'label']
    # Fill in missing values with column's average
    H = em.impute_table(H, exclude_attrs=excluded_attributes, strategy='mean')
    # Create and train a logistic regression - the best matcher from stage3.
    lg = em.LogRegMatcher(name='LogReg', random_state=0)
    lg.fit(table=H, exclude_attrs=excluded_attributes, target_attr='label')
    # Read in the candidate tuple pairs.
    C = em.read_csv_metadata(FOLDER + 'C.csv',
                             key='_id',
                             ltable=A,
                             rtable=B,
                             fk_ltable='l_id',
                             fk_rtable='r_id')  # labeled data
    # Convert C into a set of features using F
    L = em.extract_feature_vecs(C, feature_table=F, show_progress=False)
    # Fill in missing values with column's average
    L = em.impute_table(L,
                        exclude_attrs=['_id', 'l_id', 'r_id'],
                        strategy='mean')
    # Predict on L with trained matcher
    predictions = lg.predict(table=L,
                             exclude_attrs=['_id', 'l_id', 'r_id'],
                             append=True,
                             target_attr='predicted',
                             inplace=False,
                             return_probs=False,
                             probs_attr='proba')
    # Output the merged table (Basically what matches).
    # We start with rows from A that matches.
    # We then merge value from B into A.
    matched_pairs = predictions[predictions.predicted == 1]
    left_ids = matched_pairs['l_id'].to_frame()
    left_ids.columns = ['id']
    merged = pd.merge(A, left_ids, on='id')
    merged.set_index('id', inplace=True)
    B.set_index('id', inplace=True)
    black_list = {'a872', 'a987'}
    for pair in matched_pairs.itertuples():
        aid = pair.l_id
        bid = pair.r_id
        if (aid in black_list):
            continue
        # Title: keep title from A, if title from B is not an exact matched
        # from A, append B’s title to the alternative title field if B’s title
        # is not already in A’s alternative title.
        m_title = merged.loc[aid, 'title']
        a_title = merged.loc[aid, 'title']
        b_title = B.loc[bid, 'title']
        if (b_title != a_title):
            if pd.isnull(merged.loc[aid, 'alternative_titles']):
                merged.loc[aid, 'alternative_titles'] = b_title
            else:
                alt = set(merged.loc[aid, 'alternative_titles'].split(';'))
                if (b_title not in alt):
                    merged.loc[aid, 'alternative_titles'] += ';' + b_title
        for col in [
                'directors', 'writers', 'cast', 'genres', 'keywords',
                'languages', 'production_companies', 'production_countries'
        ]:
            merged.loc[aid, col] = merge_cell(merged.loc[aid, col], B.loc[bid,
                                                                          col])
        # Content rating: keep A
        # Release year: keep A
        # Opening_weekend_revenue: keep A
        # Run time
        m_runtime = int(
            (merged.loc[aid, 'run_time'] + B.loc[bid, 'run_time']) / 2)
        merged.loc[aid, 'run_time'] = m_runtime
        # Budget and Revenue
        for col in ['budget', 'revenue']:
            merged.loc[aid, col] = merge_money(merged.loc[aid, col],
                                               B.loc[bid, col])
        # Rating: take the average after converting B rating to scale 10.
        m_rating = (merged.loc[aid, 'rating'] + 0.1 * B.loc[bid, 'rating']) / 2
        merged.loc[aid, 'rating'] = m_rating
    merged.to_csv(FOLDER + 'E.csv', index=True)
コード例 #13
0
            'instance_id', 'brand', 'cpu_brand', 'cpu_model', 'cpu_type',
            'cpu_frequency', 'ram_capacity', 'ram_type', 'ram_frequency',
            'hdd_capacity', 'ssd_capacity', 'weight', 'dimensions', 'title'
        ],
        r_output_attrs=[
            'instance_id', 'brand', 'cpu_brand', 'cpu_model', 'cpu_type',
            'cpu_frequency', 'ram_capacity', 'ram_type', 'ram_frequency',
            'hdd_capacity', 'ssd_capacity', 'weight', 'dimensions', 'title'
        ],
        overlap_size=1,
        show_progress=True,
        l_output_prefix='left_',
        r_output_prefix='right_',
    )
    # Get features
    feature_table = em.get_features_for_matching(
        A, B, validate_inferred_attr_types=False)

    # Get the attributes to be projected while predicting
    attrs_from_table = [
        'left_brand', 'left_cpu_brand', 'left_cpu_model', 'left_cpu_type',
        'left_cpu_frequency', 'left_ram_capacity', 'left_ram_type',
        'left_ram_frequency', 'left_hdd_capacity', 'left_ssd_capacity',
        'left_weight', 'left_dimensions', 'left_title', 'right_brand',
        'right_cpu_brand', 'right_cpu_model', 'right_cpu_type',
        'right_cpu_frequency', 'right_ram_capacity', 'right_ram_type',
        'right_ram_frequency', 'right_hdd_capacity', 'right_ssd_capacity',
        'right_weight', 'right_dimensions', 'right_title'
    ]
    attrs_to_be_excluded = []
    attrs_to_be_excluded.extend(
        ['_id', 'left_instance_id', 'right_instance_id'])
コード例 #14
0
# Creating the Rule-Based Matcher
import py_entitymatching as em
em.set_key(hotels, 'id')
em.set_key(booking, 'id')
em.set_key(C, '_id')
em.set_ltable(C, hotels)
em.set_rtable(C, booking)
em.set_fk_rtable(C, 'r_id')
em.set_fk_ltable(C, 'l_id')

brm = em.BooleanRuleMatcher()

booking.dtypes
# Generate a set of features
F = em.get_features_for_matching(hotels, booking, validate_inferred_attr_types=False)
F.feature_name


#row = 45
#lev.get_sim_score(C.iloc[[row]]['l_norm_name'][row],C.iloc[[row]]['r_norm_name'][row])

# excute sim score
#C['pred_label']=0
#for row in range(C.shape[0]):
#    t = lev.get_sim_score(C.iloc[[row]]['l_norm_name'][row],C.iloc[[row]]['r_norm_name'][row])
#    if t > 0.5:    
#        C['pred_label'][row] = 1
#C = C.loc[C['pred_label'] == 1]
    
コード例 #15
0
                         ltable=A,
                         rtable=B,
                         fk_ltable='ltable_id',
                         fk_rtable='rtable_id')

# Load the labeled data
G = em.read_csv_metadata('S_labeled.csv',
                         key='_id',
                         ltable=A,
                         rtable=B,
                         fk_ltable='ltable_id',
                         fk_rtable='rtable_id')

# Generate a set of features
F = em.get_features_for_matching(A.iloc[:, 1:8],
                                 B.iloc[:, 1:8],
                                 validate_inferred_attr_types=False)

# Create a feature on the value of (price + rating), then compute Levenshtein similarity
sim = em.get_sim_funs_for_matching()
tok = em.get_tokenizers_for_matching()
feature_string = """lev_sim(wspace(float(ltuple['price']) + float(ltuple['rating'])), 
                            wspace(float(rtuple['price']) + float(rtuple['rating'])))"""
feature = em.get_feature_fn(feature_string, sim, tok)

# Add feature to F
em.add_feature(F, 'lev_ws_price+rating', feature)

# Convert the sample set into a set of feature vectors using F
H = em.extract_feature_vecs(G,
                            feature_table=F,
コード例 #16
0
em.to_csv_metadata(D, 'datasets/tbl_blocked_8.csv')
tbl_blocked = em.read_csv_metadata('datasets/tbl_blocked_8.csv',\
 ltable=sample_movies, rtable=sample_tracks)

S = em.sample_table(tbl_blocked, 400)
em.to_csv_metadata(S, 'datasets/sampled_8.csv')

with open('metadata_8.csv', 'wb') as data1:
    writer = csv.writer(data1, delimiter=',', quotechar='|')
    for entry in S.values:
        l = len(entry)
        item = entry[-4:]
        writer.writerow(item)

match_f = em.get_features_for_matching(sample_movies, sample_tracks)
H = em.extract_feature_vecs(S, feature_table=match_f)
with open('data_8.csv', 'wb') as data:
    writer = csv.writer(data, delimiter=',', quotechar='|')
    flag = 0
    names = []
    for idx, row in H.iterrows():
        item = []
        print row
        for it in row.iteritems():
            if flag:
                names.append(it[0])
            item.append(it[1])
        flag = 1
        writer.writerow(item)
    print names
コード例 #17
0
def workflow(path_A, path_B, path_labeled):

    # Load csv files as dataframes and set the key attribute in the dataframe
    A = em.read_csv_metadata(path_A, key='ID')
    B = em.read_csv_metadata(path_B, key='ID')

    # Run attribute equivalence blocker on brand
    ab = em.AttrEquivalenceBlocker()
    C1 = ab.block_tables(A,
                         B,
                         'Brand',
                         'Brand',
                         l_output_attrs=[
                             'Name', 'Price', 'Brand', 'Screen Size', 'RAM',
                             'Hard Drive Capacity', 'Processor Type',
                             'Processor Speed', 'Operating System',
                             'Clean Name'
                         ],
                         r_output_attrs=[
                             'Name', 'Price', 'Brand', 'Screen Size', 'RAM',
                             'Hard Drive Capacity', 'Processor Type',
                             'Processor Speed', 'Operating System',
                             'Clean Name'
                         ])

    # Get features for rule based blocking
    block_f = em.get_features_for_blocking(A,
                                           B,
                                           validate_inferred_attr_types=False)

    # Run rule based blocker with rule for jaccard score on Clean Name column
    rb = em.RuleBasedBlocker()
    rb.add_rule(
        ['Clean_Name_Clean_Name_jac_qgm_3_qgm_3(ltuple, rtuple) < 0.2'],
        block_f)
    C2 = rb.block_candset(C1)

    # Run black box blocker to compare screen size, ram, and hard drive capacity
    bb_screen = em.BlackBoxBlocker()
    bb_screen.set_black_box_function((screen_ram_hd_equal))
    C = bb_screen.block_candset(C2)

    # Load the labeled data
    L = em.read_csv_metadata(path_labeled,
                             key='_id',
                             ltable=A,
                             rtable=B,
                             fk_ltable='ltable_ID',
                             fk_rtable='rtable_ID')

    # Generate features
    feature_table = em.get_features_for_matching(
        A, B, validate_inferred_attr_types=False)
    feature_subset = feature_table.iloc[np.r_[4:10, 40:len(feature_table)], :]
    em.add_blackbox_feature(feature_subset, 'refurbished', refurbished)

    # Extract feature vectors
    feature_vectors_dev = em.extract_feature_vecs(L,
                                                  feature_table=feature_subset,
                                                  attrs_after='gold')

    # Impute feature vectors with the mean of the column values.
    feature_vectors_dev = em.impute_table(
        feature_vectors_dev,
        exclude_attrs=['_id', 'ltable_ID', 'rtable_ID', 'gold'],
        strategy='mean')

    # Train using feature vectors from the labeled data
    matcher = em.RFMatcher(name='RF')
    matcher.fit(table=feature_vectors_dev,
                exclude_attrs=['_id', 'ltable_ID', 'rtable_ID', 'gold'],
                target_attr='gold')

    # Extract feature vectors for the rest of the data
    feature_vectors = em.extract_feature_vecs(C, feature_table=feature_subset)

    # Impute feature vectors with the mean of the column values.
    feature_vectors = em.impute_table(
        feature_vectors,
        exclude_attrs=['_id', 'ltable_ID', 'rtable_ID'],
        strategy='mean')

    # Make predictions for the whole data set
    predictions = matcher.predict(
        table=feature_vectors,
        exclude_attrs=['_id', 'ltable_ID', 'rtable_ID'],
        append=True,
        target_attr='predicted',
        inplace=False)
    predictions = predictions.loc[:, [
        '_id', 'ltable_ID', 'rtable_ID', 'predicted'
    ]]

    return predictions[predictions['predicted'] == 1]
コード例 #18
0
dt = em.DTMatcher()
rf = em.RFMatcher()
nb = em.NBMatcher()
logreg = em.LogRegMatcher()
linreg = em.LinRegMatcher()
svm = em.SVMMatcher()

# Creating features
# ------------------

# Next, we need to create a set of features for the development set. py_entitymatching provides a way to automatically generate features based on the attributes in the input tables. We drop the unwanted features from the feature table

# In[113]:

## Generate features
match_f = em.get_features_for_matching(A, B)
match_f.drop([13, 14, 15, 16], inplace=True)

# In[114]:

# List the names of the features generated
match_f['feature_name']

# Converting the development set to feature vectors
# ------------------

# In[116]:

# Convert the I into a set of feature vectors using F

H = em.extract_feature_vecs(I,
コード例 #19
0
ファイル: matcher.py プロジェクト: is4ac/cs839-data-science
def main():
    # Read in data files
    A = em.read_csv_metadata(FOLDER + 'A.csv', key='id')  # imdb data
    B = em.read_csv_metadata(FOLDER + 'B.csv', key='id')  # tmdb data
    G = em.read_csv_metadata(FOLDER + 'G.csv',
                             key='_id',
                             ltable=A,
                             rtable=B,
                             fk_ltable='l_id',
                             fk_rtable='r_id')  # labeled data
    # Split G into I and J for CV
    IJ = em.split_train_test(G, train_proportion=0.5, random_state=0)
    I = IJ['train']
    J = IJ['test']
    # Save I and J to files
    I.to_csv(FOLDER + 'I.csv', index=False)
    J.to_csv(FOLDER + 'J.csv', index=False)
    # Generate features set F
    F = em.get_features_for_matching(A, B, validate_inferred_attr_types=False)
    #print(F.feature_name)
    #print(type(F))
    # Convert I to a set of feature vectors using F
    H = em.extract_feature_vecs(I,
                                feature_table=F,
                                attrs_after='label',
                                show_progress=False)
    #print(H.head)
    # Check of missing values
    #print(any(pd.notnull(H)))
    excluded_attributes = ['_id', 'l_id', 'r_id', 'label']
    # Fill in missing values with column's average
    H = em.impute_table(H, exclude_attrs=excluded_attributes, strategy='mean')
    # Create a set of matchers
    dt = em.DTMatcher(name='DecisionTree', random_state=0)
    svm = em.SVMMatcher(name='SVM', random_state=0)
    rf = em.RFMatcher(name='RF', random_state=0)
    lg = em.LogRegMatcher(name='LogReg', random_state=0)
    ln = em.LinRegMatcher(name='LinReg')
    nb = em.NBMatcher(name='NaiveBayes')
    # Selecting best matcher with CV using F1-score as criteria
    CV_result = em.select_matcher([dt, rf, svm, ln, lg, nb],
                                  table=H,
                                  exclude_attrs=excluded_attributes,
                                  k=10,
                                  target_attr='label',
                                  metric_to_select_matcher='f1',
                                  random_state=0)
    print(CV_result['cv_stats'])  # RF is the best matcher
    # Train matchers on H
    dt.fit(table=H, exclude_attrs=excluded_attributes, target_attr='label')
    rf.fit(table=H, exclude_attrs=excluded_attributes, target_attr='label')
    svm.fit(table=H, exclude_attrs=excluded_attributes, target_attr='label')
    lg.fit(table=H, exclude_attrs=excluded_attributes, target_attr='label')
    ln.fit(table=H, exclude_attrs=excluded_attributes, target_attr='label')
    nb.fit(table=H, exclude_attrs=excluded_attributes, target_attr='label')
    # Convert J into a set of features using F
    L = em.extract_feature_vecs(J,
                                feature_table=F,
                                attrs_after='label',
                                show_progress=False)
    # Fill in missing values with column's average
    L = em.impute_table(L, exclude_attrs=excluded_attributes, strategy='mean')
    # Predict on L with trained matchers
    predictions_dt = dt.predict(table=L,
                                exclude_attrs=excluded_attributes,
                                append=True,
                                target_attr='predicted',
                                inplace=False,
                                return_probs=False,
                                probs_attr='proba')
    predictions_rf = rf.predict(table=L,
                                exclude_attrs=excluded_attributes,
                                append=True,
                                target_attr='predicted',
                                inplace=False,
                                return_probs=False,
                                probs_attr='proba')
    predictions_svm = svm.predict(table=L,
                                  exclude_attrs=excluded_attributes,
                                  append=True,
                                  target_attr='predicted',
                                  inplace=False,
                                  return_probs=False,
                                  probs_attr='proba')
    predictions_lg = lg.predict(table=L,
                                exclude_attrs=excluded_attributes,
                                append=True,
                                target_attr='predicted',
                                inplace=False,
                                return_probs=False,
                                probs_attr='proba')
    predictions_ln = ln.predict(table=L,
                                exclude_attrs=excluded_attributes,
                                append=True,
                                target_attr='predicted',
                                inplace=False,
                                return_probs=False,
                                probs_attr='proba')
    predictions_nb = nb.predict(table=L,
                                exclude_attrs=excluded_attributes,
                                append=True,
                                target_attr='predicted',
                                inplace=False,
                                return_probs=False,
                                probs_attr='proba')
    # Evaluate predictions
    dt_eval = em.eval_matches(predictions_dt, 'label', 'predicted')
    em.print_eval_summary(dt_eval)
    rf_eval = em.eval_matches(predictions_rf, 'label', 'predicted')
    em.print_eval_summary(rf_eval)
    svm_eval = em.eval_matches(predictions_svm, 'label', 'predicted')
    em.print_eval_summary(svm_eval)
    lg_eval = em.eval_matches(predictions_lg, 'label', 'predicted')
    em.print_eval_summary(lg_eval)
    ln_eval = em.eval_matches(predictions_ln, 'label', 'predicted')
    em.print_eval_summary(ln_eval)
    nb_eval = em.eval_matches(predictions_nb, 'label', 'predicted')
    em.print_eval_summary(nb_eval)
コード例 #20
0
ファイル: createTableE.py プロジェクト: dereklh4/cs839_stage4
                             rtable=wikiData,
                             fk_ltable="ltable_ID",
                             fk_rtable="rtable_ID")
    J = em.read_csv_metadata(j_file,
                             key="_id",
                             ltable=metacriticData,
                             rtable=wikiData,
                             fk_ltable="ltable_ID",
                             fk_rtable="rtable_ID")
    print("Reading I and J from files")
print(len(I))
print(len(J))

# Generate a set of features
F = em.get_features_for_matching(metacriticData,
                                 wikiData,
                                 validate_inferred_attr_types=False)

# Convert the I into a set of feature vectors using F
H = em.extract_feature_vecs(I,
                            feature_table=F,
                            attrs_after='label',
                            show_progress=False)

# create learners
random_state = 0

dt = em.DTMatcher(name='DecisionTree', random_state=random_state)
rf = em.RFMatcher(name='RF', random_state=random_state)
svm = em.SVMMatcher(name='SVM', random_state=random_state)
ln = em.LinRegMatcher(name='LinReg')
コード例 #21
0
ファイル: EMPreprocess.py プロジェクト: shraga89/LFM
#                      l_output_attrs=interest_cols,
#                      r_output_attrs=interest_cols,
#                      overlap_size=5)
# K1 = ob.block_candset(K1, 'authors', 'authors', overlap_size=3)

K1 = ob.block_tables(A,
                     B,
                     'name',
                     'name',
                     l_output_attrs=interest_cols,
                     r_output_attrs=interest_cols,
                     overlap_size=1)
# K1 = ob.block_candset(K1, 'de', 'authors', overlap_size=3)

sim = em.get_sim_funs_for_matching()
features = em.get_features_for_matching(A.drop('id', axis=1),
                                        B.drop('id', axis=1))

K1.to_csv(path + 'K1.csv')
L = em.read_csv_metadata(path + 'K1.csv',
                         key='_id',
                         ltable=A,
                         rtable=B,
                         fk_ltable='ltable_id',
                         fk_rtable='rtable_id')

# L = K1.copy()
# print(L.columns)
print('Loading labels...')
L['gold'] = 0
trues = exact[exact['gold'] == 1][['ltable.id', 'rtable.id']]
L['temp'] = L['ltable_id'].astype(str) + L['rtable_id'].astype(str)
コード例 #22
0
def run_baseline_pair(input_path: str, setting_keys: List[str] = None):
    # Read settings file
    with open(f'{input_path}') as file:
        settings = json.load(file)

    for setting_key, setting_data in settings.items():
        # Only run the setting if the key is in the list of settings or no setting_keys are provided
        if setting_keys is None:
            pass
        elif setting_keys is not None and setting_key not in setting_keys:
            continue

        # Get name of settings
        settings_name = create_config_key(setting_data)
        # Get the relevant data from the settings
        model = setting_data.get("model")
        vectorization = setting_data.get("vectorization")
        dataset_size = setting_data.get("dataset_size")
        use_description = setting_data.get("use_description")
        train_langs = setting_data.get("train_lang")
        test_langs = setting_data.get("eval_lang")
        category = setting_data.get("category")

        # Create a string of the train languages
        train_langs_str = ", ".join(train_langs)

        # Process the categories separately
        dataset_p = pathlib.Path(input_path).parent.joinpath("datasets")
        if dataset_size == SMALL:
            train_data_p = dataset_p.joinpath(
                f'pairwise_train_set_{category}_{SMALL}.csv')
            test_data_p = dataset_p.joinpath(
                f'pairwise_test_set_{category}.csv')
        elif dataset_size == MEDIUM:
            train_data_p = dataset_p.joinpath(
                f'pairwise_train_set_{category}_{MEDIUM}.csv')
            test_data_p = dataset_p.joinpath(
                f'pairwise_test_set_{category}.csv')
        elif dataset_size == LARGE:
            train_data_p = dataset_p.joinpath(
                f'pairwise_train_set_{category}_{LARGE}.csv')
            test_data_p = dataset_p.joinpath(
                f'pairwise_test_set_{category}.csv')
        elif dataset_size == XLARGE:
            train_data_p = dataset_p.joinpath(
                f'pairwise_train_set_{category}_{XLARGE}.csv')
            test_data_p = dataset_p.joinpath(
                f'pairwise_test_set_{category}.csv')

        # Read the data
        train_data = pd.read_csv(train_data_p)
        test_data = pd.read_csv(test_data_p)

        # Filter the train data:
        train_data = train_data.loc[train_data["lang_1"].isin(train_langs)]
        # Prepare the train and test data for the experiments and get the mapping of the labels
        train_data, test_data = prep_data_pair(train_data, test_data,
                                               use_description)

        ## Generate features
        if vectorization == COOC:
            # Generate CooC feature
            contents = train_data['content_1'].append(train_data['content_2'])
            contents = contents.drop_duplicates()

            # Initialize Vectorizer
            cv = CountVectorizer(binary=True,
                                 analyzer='word',
                                 encoding='utf-8',
                                 max_features=5000)

            # Fit Vectorizer
            cv.fit(contents)

            # Retrieve representations & word co-occurence vectors for train set
            cv_content1_train = cv.transform(train_data['content_1']).toarray()
            cv_content2_train = cv.transform(train_data['content_2']).toarray()
            train_data_embeddings = np.multiply(cv_content1_train,
                                                cv_content2_train)

        elif vectorization == MAGELLAN:
            # Retrieve tables A,B,G
            A, B, G = prep_data_pair_mallegan(train_data, use_description)

            # Generate features automatically
            feature_table = em.get_features_for_matching(
                A, B, validate_inferred_attr_types=False)

            # Select the attrs. to be included in the feature vector table
            # Title refers to either the title or the concatenated title and description
            attrs_from_table = ['title_left', 'title_right']

            # Convert the labeled data to feature vectors using the feature table
            H = em.extract_feature_vecs(G,
                                        feature_table=feature_table,
                                        attrs_before=attrs_from_table,
                                        attrs_after='label',
                                        show_progress=False)

            # Replace NA values
            H.fillna(-1, inplace=True)

            # Select attributes which should not be used by the classifier
            attrs_to_be_excluded = []
            attrs_to_be_excluded.extend(['id', 'l_id', 'r_id',
                                         'label'])  # label
            attrs_to_be_excluded.extend(attrs_from_table)

            # Retrieve training data
            train_data_embeddings = H.drop(columns=attrs_to_be_excluded)

            # Normalize features
            normalizer = preprocessing.Normalizer().fit(train_data_embeddings)
            train_data_embeddings = normalizer.transform(train_data_embeddings)

        else:
            # Other vectorizations are not implemented
            raise AssertionError

        # Fit the models
        if model == LOGIT:
            est = LogisticRegression()
            parameters = {
                'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000],
                'class_weight': ['balanced'],
                'max_iter': [5000],
                'n_jobs': [-2]
            }

        elif model == RAFO:
            est = RandomForestClassifier()
            parameters = {
                'n_estimators': [100],
                'max_features': ['sqrt', 'log2', None],
                'max_depth': [2, 4, 7, 10],
                'min_samples_split': [2, 5, 10, 20],
                'min_samples_leaf': [1, 2, 4, 8],
                'class_weight': ['balanced_subsample'],
                'n_jobs': [-2]
            }

        elif model == SVM:
            est = LinearSVC()
            parameters = {
                'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000],
                'class_weight': ['balanced']
            }
        else:
            # Other models are not implemented
            raise AssertionError

        print(model)

        # Define grid search and fit model
        rs = RandomizedSearchCV(estimator=est,
                                param_distributions=parameters,
                                scoring="f1_macro",
                                cv=5,
                                n_jobs=-2,
                                verbose=1,
                                n_iter=100,
                                refit=True)

        rs.fit(train_data_embeddings, train_data["label"].astype(int))

        # Generate list for scores
        scores_per_lang = {}

        ## Run predictions
        # Run predictions for cooc feature
        if vectorization == COOC:
            for lang in test_langs:
                # Subset the test data
                test_data_lang = test_data.loc[test_data['lang_1'] == lang]

                # Retrieve representations & word co-occurence vectors for test set
                cv_content1_test = cv.transform(
                    test_data_lang['content_1']).toarray()
                cv_content2_test = cv.transform(
                    test_data_lang['content_2']).toarray()
                test_data_embeddings_lang = np.multiply(
                    cv_content1_test, cv_content2_test)

                # Prediction and computation of metrics to measure performance of model
                pred = rs.best_estimator_.predict(test_data_embeddings_lang)
                scores_per_lang[lang] = compute_metrics({
                    "labels":
                    test_data_lang["label"],
                    "predictions":
                    pred
                }).get("f1")
                output_and_store_results(setting_data, settings_name, category,
                                         train_langs_str, lang,
                                         scores_per_lang[lang], "",
                                         str(rs.best_params_), input_path,
                                         pred)

        # Run predictions for Magellan features
        elif vectorization == MAGELLAN:
            for lang in test_langs:
                # Subset the test data
                test_data_lang = test_data.loc[test_data['lang_1'] == lang]

                # Retrieve tables A,B,G
                A, B, G = prep_data_pair_mallegan(test_data_lang,
                                                  use_description)

                # Generate features
                # feature_table = em.get_features_for_matching(A, B, validate_inferred_attr_types=False)
                H = em.extract_feature_vecs(G,
                                            feature_table=feature_table,
                                            attrs_before=attrs_from_table,
                                            attrs_after='label',
                                            show_progress=False)

                # Replace NA values
                H.fillna(-1, inplace=True)

                # Retrieve features
                test_data_embeddings_lang = H.drop(
                    columns=attrs_to_be_excluded)

                # Normalize Features
                test_data_embeddings_lang = normalizer.transform(
                    test_data_embeddings_lang)

                # Prediction and computation of metrics to measure performance of model
                pred = rs.best_estimator_.predict(test_data_embeddings_lang)
                scores_per_lang[lang] = compute_metrics({
                    "labels":
                    test_data_lang["label"],
                    "predictions":
                    pred
                }).get("f1")
                output_and_store_results(setting_data, settings_name, category,
                                         train_langs_str, lang,
                                         scores_per_lang[lang], "",
                                         str(rs.best_params_), input_path,
                                         pred)
コード例 #23
0
    # and orig_B, or if I use lz4 compression
    overlap_size=6,
    nltable_chunks=args.block_tables_chunks,
    nrtable_chunks=args.block_tables_chunks,
    # I set compute to True to see which part of the workflow was
    # taking a long time
    scheduler=client.get,
    compute=True,
    rem_stop_words=True)
logging.debug('finished ob.block_tables()')
logging.debug('len(C) = %d' % len(C))

L = pd.read_csv('./data/sample_labeled_data.csv')
logging.debug('loaded sample labeled data')

F = em.get_features_for_matching(orig_A, orig_B)
logging.debug('ran em.get_features_for_matching()')

# Convert L into feature vectors using updated F
# must use orig_A and orig_B; I get a KeyError when I try to use the sample data
# requires workers with > 1GB of memory, otherwise (sometimes) does not finish
logging.debug("starting H=extract_feature_vecs()")
H = extract_feature_vecs(
    L,
    orig_A,
    orig_B,
    '_id',
    'l_id',
    'r_id',
    'id',
    'id',