Example #1
def create_feature_vectors(C, l, r, columns):

    l = l.loc[:, columns]
    r = r.loc[:, columns]

    C['id'] = C.index
    l['a.index'] = l.index
    r['b.index'] = r.index

    setup_keys(C, l, r)

    atypes_l = em.get_attr_types(l)
    atypes_r = em.get_attr_types(r)

    for c in columns:
        if atypes_l[c] != atypes_r[c]:  # how to do this more gracefully?
            atypes_r[c] = 'str_bt_5w_10w'
            atypes_l[c] = 'str_bt_5w_10w'

    corres = em.get_attr_corres(l, r)

    tok = em.get_tokenizers_for_blocking()
    sim = em.get_sim_funs_for_blocking()

    feature_table = em.get_features(l, r, atypes_l, atypes_r, corres, tok, sim)

    # Generate feature vectors for the candidate set
    X = get_feature_vectors(C, feature_table, attrs_before=['matching'])

    return X
def _get_field_correspondence_list(ltable, rtable, lkey, rkey, attr_corres):
    corres_list = []
    if attr_corres is None or len(attr_corres) == 0:
        corres_list = mg.get_attr_corres(ltable, rtable)['corres']
        if len(corres_list) == 0:
            raise AssertionError('Error: the field correspondence list'
                                 ' is empty. Please specify the field'
                                 ' correspondence!')
    else:
        for tu in attr_corres:
            corres_list.append(tu)

    key_pair = (lkey, rkey)
    if key_pair not in corres_list:
        corres_list.append(key_pair)

    return corres_list
def _get_field_correspondence_list(ltable, rtable, lkey, rkey, attr_corres):
    corres_list = []
    if attr_corres is None or len(attr_corres) == 0:
        corres_list = em.get_attr_corres(ltable, rtable)['corres']
        if len(corres_list) == 0:
            raise AssertionError('Error: the field correspondence list'
                                 ' is empty. Please specify the field'
                                 ' correspondence!')
    else:
        for tu in attr_corres:
            corres_list.append(tu)

    # If the key correspondence is not in the list, add it in.
    key_pair = (lkey, rkey)
    if key_pair not in corres_list:
        corres_list.append(key_pair)

    return corres_list
Example #4
def extract_features(ltable_df, rtable_df, candset_df):
    tokenizers = em.get_tokenizers_for_matching()
    sim_functions = em.get_sim_funs_for_matching()
    left_attr_types = em.get_attr_types(ltable_df)
    right_attr_types = em.get_attr_types(rtable_df)
    correspondences = em.get_attr_corres(ltable_df, rtable_df)

    feature_dict_list = []
    attribute_type_rank = {'boolean': 1, 'numeric': 2, 'str_eq_1w': 3, 'str_bt_1w_5w': 4,
                           'str_bt_5w_10w': 5, 'str_gt_10w': 6, 'un_determined': 7}
    for c in correspondences['corres']:
        if left_attr_types[c[0]] != right_attr_types[c[1]]:
            if attribute_type_rank[left_attr_types[c[0]]] < attribute_type_rank[right_attr_types[c[1]]]:
                left_attr_types[c[0]] = right_attr_types[c[1]]
            else:
                right_attr_types[c[1]] = left_attr_types[c[0]]

    feature_records = get_features(ltable_df, rtable_df, left_attr_types, right_attr_types,
                                   correspondences, tokenizers, sim_functions)
    # Remove all features based on id - they are often useless
    feature_records = feature_records[feature_records.left_attribute != 'id']
    feature_records.reset_index(inplace=True,drop=True)

    distance_functions = ["lev_dist", "rdf"]
    non_normalized_functions = ["aff", "sw", "swn", "nmw"]
    keep_features = [True]*feature_records.shape[0]
    for i in range(feature_records.shape[0]):
        feature = feature_records.loc[i,"feature_name"]
        for func in distance_functions + non_normalized_functions:
            if func in feature:
                keep_features[i] = False
    feature_records = feature_records.loc[keep_features,:]

    print("\n\nExtracting the full set of features:")
    candset_features_df = em.extract_feature_vecs(candset_df, feature_table=feature_records,
                                                  attrs_after='gold', show_progress=True,
                                                  n_jobs=-1)
    candset_features_df.fillna(value=0, inplace=True)

    return candset_features_df
Example #5
eval_set = train_test['test']
em.to_csv_metadata(dev_set, 'datasets/dev_set.csv')
em.to_csv_metadata(eval_set, 'datasets/eval_set.csv')

# myset = em.split_train_test(dev_set, train_proportion=0.9)
# I_set = myset['train']
# J_set = myset['test']
# em.to_csv_metadata(I_set, 'datasets/I_set.csv')
# em.to_csv_metadata(J_set, 'datasets/J_set.csv')

# creating feature for matching
match_t = em.get_tokenizers_for_matching()
match_s = em.get_sim_funs_for_matching()
atypes1 = em.get_attr_types(sampled_movies)
atypes2 = em.get_attr_types(sampled_tracks)
match_c = em.get_attr_corres(sampled_movies, sampled_tracks)
match_f = em.get_features(sampled_movies, sampled_tracks, atypes1, atypes2,
                          match_c, match_t, match_s)

# generating feature vectors
H = em.extract_feature_vecs(dev_set,
                            feature_table=match_f,
                            attrs_after='label',
                            show_progress=False)

# filling missing values in feature vectors
H.fillna(value=0, inplace=True)

# creating a set of learning-based matchers
dt = em.DTMatcher(name='DecisionTree', random_state=0)
svm = em.SVMMatcher(name='SVM', random_state=0)
def run_magellan(train_set,
                 valid_set,
                 test_set,
                 feature_combinations,
                 classifiers,
                 experiment_name,
                 write_test_set_for_inspection=False):
    train_path = os.path.dirname(train_set)
    train_file = os.path.basename(train_set)
    test_path = os.path.dirname(test_set)
    test_file = os.path.basename(test_set)
    report_train_name = train_file.replace('.csv', '')
    report_test_name = test_file.replace('.csv', '')

    train_set_left = train_file.replace('pairs', 'left')
    train_set_right = train_file.replace('pairs', 'right')

    test_set_left = test_file.replace('pairs', 'left')
    test_set_right = test_file.replace('pairs', 'right')

    os.makedirs(os.path.dirname(
        '../../../reports/magellan/{}/'.format(experiment_name)),
                exist_ok=True)

    try:
        os.remove('../../../reports/magellan/{}/{}_{}.csv'.format(
            experiment_name, report_train_name, report_test_name))
    except OSError:
        pass

    with open(
            '../../../reports/magellan/{}/{}_{}.csv'.format(
                experiment_name, report_train_name, report_test_name),
            "w") as f:
        f.write(
            'feature#####model#####mean_train_score#####std_train_score#####mean_valid_score#####std_valid_score#####precision_test#####recall_test#####f1_test#####best_params#####train_time#####prediction_time#####feature_importance#####experiment_name#####train_set#####test_set\n'
        )

    for run in range(1, 4):
        for feature_combination in feature_combinations:

            A_t = em.read_csv_metadata(train_path + '/' + train_set_left,
                                       key='mag_id')
            B_t = em.read_csv_metadata(train_path + '/' + train_set_right,
                                       key='mag_id')
            # Load the pre-labeled data
            S_t = em.read_csv_metadata(train_set,
                                       key='_id',
                                       ltable=A_t,
                                       rtable=B_t,
                                       fk_ltable='ltable_mag_id',
                                       fk_rtable='rtable_mag_id')

            A_gs = em.read_csv_metadata(test_path + '/' + test_set_left,
                                        key='mag_id')
            B_gs = em.read_csv_metadata(test_path + '/' + test_set_right,
                                        key='mag_id')
            # Load the pre-labeled data
            S_gs = em.read_csv_metadata(test_set,
                                        key='_id',
                                        ltable=A_gs,
                                        rtable=B_gs,
                                        fk_ltable='ltable_mag_id',
                                        fk_rtable='rtable_mag_id')

            A_t.fillna('', inplace=True)
            A_gs.fillna('', inplace=True)

            B_t.fillna('', inplace=True)
            B_gs.fillna('', inplace=True)

            S_t.fillna('', inplace=True)
            S_gs.fillna('', inplace=True)

            ## DIRTY FIX, CLEAN UP: coerce empty 'price' strings to NaN so the
            ## price columns can be cast to float64.
            if 'name' in A_t.columns:
                A_t["price"] = A_t["price"].replace(r'^\s*$',
                                                    np.nan,
                                                    regex=True)
                A_t["price"] = A_t["price"].astype('float64')
                A_gs["price"] = A_gs["price"].replace(r'^\s*$',
                                                      np.nan,
                                                      regex=True)
                A_gs["price"] = A_gs["price"].astype('float64')
                B_t["price"] = B_t["price"].replace(r'^\s*$',
                                                    np.nan,
                                                    regex=True)
                B_t["price"] = B_t["price"].astype('float64')
                B_gs["price"] = B_gs["price"].replace(r'^\s*$',
                                                      np.nan,
                                                      regex=True)
                B_gs["price"] = B_gs["price"].astype('float64')

                S_t["ltable_price"] = S_t["ltable_price"].replace(r'^\s*$',
                                                                  np.nan,
                                                                  regex=True)
                S_t["ltable_price"] = S_t["ltable_price"].astype('float64')
                S_t["rtable_price"] = S_t["rtable_price"].replace(r'^\s*$',
                                                                  np.nan,
                                                                  regex=True)
                S_t["rtable_price"] = S_t["rtable_price"].astype('float64')

                S_gs["ltable_price"] = S_gs["ltable_price"].replace(r'^\s*$',
                                                                    np.nan,
                                                                    regex=True)
                S_gs["ltable_price"] = S_gs["ltable_price"].astype('float64')
                S_gs["rtable_price"] = S_gs["rtable_price"].replace(r'^\s*$',
                                                                    np.nan,
                                                                    regex=True)
                S_gs["rtable_price"] = S_gs["rtable_price"].astype('float64')

            atypes1 = em.get_attr_types(A_t)
            atypes2 = em.get_attr_types(B_t)

            match_c = em.get_attr_corres(A_t, B_t)

            match_c['corres'] = []

            # select attributes to compare
            for feature in feature_combination:
                match_c['corres'].append((feature, feature))

            tok = em.get_tokenizers_for_matching()
            sim = em.get_sim_funs_for_matching()

            F_t = em.get_features(A_t, B_t, atypes1, atypes2, match_c, tok,
                                  sim)

            H_t = em.extract_feature_vecs(S_t,
                                          feature_table=F_t,
                                          attrs_after=['label', 'pair_id'],
                                          show_progress=False)
            H_gs = em.extract_feature_vecs(S_gs,
                                           feature_table=F_t,
                                           attrs_after='label',
                                           show_progress=False)

            H_t = H_t.fillna(-1)
            H_gs = H_gs.fillna(-1)

            validation_ids_df = pd.read_csv(valid_set)
            val_df = H_t[H_t['pair_id'].isin(
                validation_ids_df['pair_id'].values)]
            train_only_df = H_t[~H_t['pair_id'].
                                isin(validation_ids_df['pair_id'].values)]

            train_only_df = train_only_df.drop(columns='pair_id')
            val_df = val_df.drop(columns='pair_id')

            train_only_df = train_only_df.sample(frac=1, random_state=42)

            # Ratio of negative to positive examples, later used as
            # scale_pos_weight for XGBoost.
            pos_neg = H_t['label'].value_counts()
            pos_neg = round(pos_neg[0] / pos_neg[1])

            # Mark every training row with -1 (never used as validation) and every
            # validation row with 0, so the fold array has one entry per row of the
            # concatenated train_df below.
            train_ind = [-1] * len(train_only_df)
            val_ind = [0] * len(val_df)

            ps = PredefinedSplit(test_fold=np.concatenate((train_ind,
                                                           val_ind)))

            train_df = pd.concat([train_only_df, val_df])

            for k, v in classifiers.items():

                classifier = v['clf']
                if 'random_state' in classifier.get_params().keys():
                    classifier = classifier.set_params(**{'random_state': run})

                # add pos_neg ratio to XGBoost params
                if k == 'XGBoost':
                    v['params']['scale_pos_weight'] = [1, pos_neg]

                model = RandomizedSearchCV(cv=ps,
                                           estimator=classifier,
                                           param_distributions=v['params'],
                                           random_state=42,
                                           n_jobs=4,
                                           scoring='f1',
                                           n_iter=500,
                                           pre_dispatch=8,
                                           return_train_score=True)

                feats_train = train_df.drop(
                    ['_id', 'ltable_mag_id', 'rtable_mag_id', 'label'], axis=1)
                labels_train = train_df['label']
                feats_gs = H_gs.drop(
                    ['_id', 'ltable_mag_id', 'rtable_mag_id', 'label'], axis=1)
                labels_gs = H_gs['label']

                try:
                    model.fit(feats_train, labels_train)
                except ValueError:
                    set_trace()

                parameters = model.best_params_

                score_names = [
                    'mean_train_score', 'std_train_score', 'mean_test_score',
                    'std_test_score'
                ]
                scores = {}
                score_string = ''
                for name in score_names:
                    scores[name] = model.cv_results_[name][model.best_index_]
                    score_string = score_string + name + ': ' + str(
                        scores[name]) + ' '

                feature_names = list(feats_train.columns)

                # Rank features by model-specific importance (coefficients for the
                # linear models, feature_importances_ for the tree-based models).
                if k == 'LogisticRegression' or k == 'LinearSVC':
                    most_important_features = model.best_estimator_.coef_
                    word_importance = zip(feature_names,
                                          most_important_features[0].tolist())
                    word_importance = sorted(
                        word_importance,
                        key=lambda importance: importance[1],
                        reverse=True)
                elif k == 'RandomForest' or k == 'DecisionTree' or k == 'XGBoost':
                    most_important_features = model.best_estimator_.feature_importances_
                    word_importance = zip(feature_names,
                                          most_important_features.tolist())
                    word_importance = sorted(
                        word_importance,
                        key=lambda importance: importance[1],
                        reverse=True)
                elif k == 'NaiveBayes':
                    word_importance = ''

                if k == 'LogisticRegression':
                    learner = LogisticRegression(random_state=run,
                                                 solver='liblinear',
                                                 **parameters)
                elif k == 'NaiveBayes':
                    learner = GaussianNB()
                elif k == 'DecisionTree':
                    learner = DecisionTreeClassifier(random_state=run,
                                                     **parameters)
                elif k == 'LinearSVC':
                    learner = LinearSVC(random_state=run,
                                        dual=False,
                                        **parameters)
                elif k == 'RandomForest':
                    learner = RandomForestClassifier(random_state=run,
                                                     n_jobs=4,
                                                     **parameters)
                elif k == 'XGBoost':
                    learner = xgb.XGBClassifier(random_state=run,
                                                n_jobs=4,
                                                **parameters)
                else:
                    print('Learner is not a valid option')
                    break

                model = learner
                # Shuffle the final training set, then split into features and labels.
                shuffled_train_df = train_only_df.sample(frac=1, random_state=42)
                feats_train = shuffled_train_df.drop(
                    ['_id', 'ltable_mag_id', 'rtable_mag_id', 'label'], axis=1)
                labels_train = shuffled_train_df['label']

                start = time.time()
                model.fit(feats_train, labels_train)
                end = time.time()

                train_time = end - start

                start = time.time()
                preds_gs = model.predict(feats_gs)

                end = time.time()

                pred_time = end - start

                gs_report = classification_report(labels_gs,
                                                  preds_gs,
                                                  output_dict=True)

                feature_report = '+'.join(feature_combination)

                if write_test_set_for_inspection:

                    out_path = '../../../data/processed/wdc-lspc/inspection/{}/magellan/'.format(
                        experiment_name)
                    os.makedirs(os.path.dirname(out_path), exist_ok=True)

                    file_name = '_'.join([
                        os.path.basename(train_set),
                        os.path.basename(test_set), k, feature_report
                    ])
                    file_name = file_name.replace('.csv', '')
                    file_name += f'_{run}.pkl.gz'

                    test_inspection_df = S_gs.copy()
                    if k == 'LinearSVC':
                        proba_gs = model.decision_function(feats_gs).tolist()
                    else:
                        proba_gs = model.predict_proba(feats_gs).tolist()
                    test_inspection_df['pred'] = preds_gs
                    test_inspection_df['Class Prob'] = proba_gs
                    test_inspection_df.to_pickle(out_path + file_name,
                                                 compression='gzip')

                with open(
                        '../../../reports/magellan/{}/{}_{}.csv'.format(
                            experiment_name, report_train_name,
                            report_test_name), "a") as f:
                    f.write(feature_report + '#####' + k + '#####' +
                            str(scores['mean_train_score']) + '#####' +
                            str(scores['std_train_score']) + '#####' +
                            str(scores['mean_test_score']) + '#####' +
                            str(scores['std_test_score']) + '#####' +
                            str(gs_report['1']['precision']) + '#####' +
                            str(gs_report['1']['recall']) + '#####' +
                            str(gs_report['1']['f1-score']) + '#####' +
                            str(parameters) + '#####' + str(train_time) +
                            '#####' + str(pred_time) + '#####' +
                            str(word_importance[0:100]) + '#####' +
                            experiment_name + '#####' + report_train_name +
                            '#####' + report_test_name + '\n')
Example #7
print(len(candidate_pairs))

candidate_pairs = ob.block_candset(candidate_pairs,
                                   'artist',
                                   'artist',
                                   word_level=True,
                                   overlap_size=1,
                                   show_progress=True)

print(len(candidate_pairs))

#em.to_csv_metadata(reduced_pairs,'C:/Users/Daniel/Documents/UW/838/Project/Stage3/data/pairs_after_ob_title_and_artist.csv')

block_f = em.get_features_for_blocking(songs, tracks)
block_c = em.get_attr_corres(songs, tracks)
block_t = em.get_tokenizers_for_blocking()
block_s = em.get_sim_funs_for_blocking()

atypes1 = em.get_attr_types(songs)
atypes2 = em.get_attr_types(tracks)

block_f = em.get_features(songs, tracks, atypes1, atypes2, block_c, block_t,
                          block_s)

rb = em.RuleBasedBlocker()
rb.add_rule(["name_name_jac_dlm_dc0_dlm_dc0(ltuple, rtuple) < 0.3"], block_f)

candidate_pairs = rb.block_candset(candidate_pairs, show_progress=True)

print(len(candidate_pairs))
dt = em.DTMatcher(name='DecisionTree', random_state=0)
svm = em.SVMMatcher(name='SVM', random_state=0)
rf = em.RFMatcher(name='RF', random_state=0)
lg = em.LogRegMatcher(name='LogReg', random_state=0)
ln = em.LinRegMatcher(name='LinReg')
nb = em.NBMatcher(name='Naive Bayes')

#**********************Creating Features *******************************************************

#************* Change feature datatype*****************
a_types = em.get_attr_types(A)
b_types = em.get_attr_types(B)
b_types['Name'] = a_types['Name']

match_c = em.get_attr_corres(A, B)
match_t = em.get_tokenizers_for_blocking()
match_s = em.get_sim_funs_for_blocking()

F = em.get_features(
    A,
    B,
    a_types,
    b_types,
    match_c,
    match_t,
    match_s,
)

#***********************          Drop Attributes: ISBN_10 and ISBN_13          *****************************************************
Example #9
G = em.read_csv_metadata("../DATA/Labelled Set G.csv",
                         key="_id",
                         fk_ltable="ltable_ID",
                         fk_rtable="rtable_ID",
                         ltable=A,
                         rtable=B)

# Split G into I an J
IJ = em.split_train_test(G, train_proportion=0.66, random_state=29)
I = IJ['train']
J = IJ['test']

atypes1 = em.get_attr_types(A)
atypes2 = em.get_attr_types(B)

block_c = em.get_attr_corres(A, B)
block_c['corres'] = [('title', 'title'), ('author', 'author')]

tok = em.get_tokenizers_for_blocking()
sim = em.get_sim_funs_for_blocking()

feature_table = em.get_features(A, B, atypes1, atypes2, block_c, tok, sim)

# Convert the I into a set of feature vectors using F
H = em.extract_feature_vecs(I,
                            feature_table=feature_table,
                            attrs_after='label',
                            show_progress=False)
# Impute feature vectors with the mean of the column values.
H = em.impute_table(H,
                    exclude_attrs=['_id', 'ltable_ID', 'rtable_ID', 'label'],
                    strategy='mean')
Example #10
train_data = split['train']
test_data = split['test']

dt = em.DTMatcher(name='DecisionTree', random_state=0)
svm = em.SVMMatcher(name='SVM', random_state=0)
rf = em.RFMatcher(name='RF', random_state=0)
lg = em.LogRegMatcher(name='LogReg', random_state=0)
ln = em.LinRegMatcher(name='LinReg')
nb = em.NBMatcher(name='NaiveBayes')


# Before we can apply any machine learning technique, we need to extract a set of features. Fortunately, the **py_entitymatching** package can generate these features automatically once we specify which columns in the two datasets correspond to each other. The following code snippet starts by specifying the correspondences between the columns of the two datasets. It then uses **py_entitymatching** to determine the type of each column. From the column types of each dataset (stored in the variables *l_attr_types* and *r_attr_types*) and the tokenizers and similarity functions suggested by the package, it derives a set of instructions for computing features. Note that the variable **F** is not the set of extracted features; rather, it encodes the instructions for computing them.


attr_corres = em.get_attr_corres(kaggle_data, imdb_data)
attr_corres['corres'] = [('norm_movie_title', 'norm_title'),
                         ('norm_title_year', 'norm_year'),
                         ('content_rating', 'mpaa'),
                         ('budget', 'budget')]

l_attr_types = em.get_attr_types(kaggle_data)
r_attr_types = em.get_attr_types(imdb_data)

tok = em.get_tokenizers_for_matching()
sim = em.get_sim_funs_for_matching()

F = em.get_features(kaggle_data, imdb_data, l_attr_types, r_attr_types, attr_corres, tok, sim)
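
# As noted above, F only encodes instructions for computing features; a separate call
# materializes the feature vectors. A minimal sketch of that follow-up step, assuming a
# labeled candidate set G (read with em.read_csv_metadata and carrying a 'label' column)
# is available; the name G and its columns are illustrative assumptions, not part of the
# original snippet:
H = em.extract_feature_vecs(G,
                            feature_table=F,
                            attrs_after='label',
                            show_progress=False)
# Replace missing similarity scores before training a matcher.
H.fillna(value=0, inplace=True)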