Example #1
def fit_deepmatcher(model_args, train, validation, test, batch_size = 16):
    '''
    Takes the train/validation/test sets generated by dm.data.process() from the temporary CSV
    files written by perpare_data_deepmatcher.

    Outputs:
        (train_predictions, valid_predictions, test_predictions, None, post_blocked_all_sets_labels)

        * The None keeps the tuple in the format expected by evaluation_functions.py
    '''
    # Configure the matching algorithm
    model = dm.MatchingModel(attr_summarizer=model_args["attr_summarizer"])
    # Fit the model; the epoch with the best validation F1 score is saved to best_save_path
    model.run_train(
        train,
        validation,
        epochs=10,
        batch_size=batch_size,
        best_save_path='../results/' + model_args["attr_summarizer"] + '.pth',
        pos_neg_ratio=2)
    # Create and store predictions
    ## The dict key is the attr_summarizer setting
    train_predictions = {model_args["attr_summarizer"]: model.run_prediction(train).match_score.values}
    valid_predictions = {model_args["attr_summarizer"]: model.run_prediction(validation).match_score.values}
    test_predictions = {model_args["attr_summarizer"]: model.run_prediction(test).match_score.values}


    # Create source of truth to be used for evaluation_functions.py
    post_blocked_all_sets_labels = {"train": train.get_raw_table().y.values,
                                    "valid": validation.get_raw_table().y.values,
                                    "test": test.get_raw_table().y.values}
    
    return (train_predictions, valid_predictions, test_predictions, None, post_blocked_all_sets_labels)
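A minimal usage sketch, assuming train/validation/test already come from dm.data.process(); the model setting and variable names here are illustrative, not from the original:

model_args = {"attr_summarizer": "hybrid"}  # illustrative setting
(train_preds, valid_preds, test_preds,
 _, labels) = fit_deepmatcher(model_args, train, validation, test, batch_size=16)
# Each prediction dict is keyed by the attr_summarizer name:
test_scores = test_preds["hybrid"]   # raw match scores
test_labels = labels["test"]         # ground-truth labels for the test split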
Example #2
def predict_and_write_for_inspection(test_path, model_path, experiment_name,
                                     gpu_id, nn_type, comp_type, features):

    out_path = '../../../data/processed/inspection/{}/deepmatcher/'.format(
        experiment_name)
    os.makedirs(os.path.dirname(out_path), exist_ok=True)

    ignore_columns = get_features(test_path)
    left_right_features = ['ltable_' + feat for feat in features]
    left_right_features.extend(['rtable_' + feat for feat in features])
    for feat in left_right_features:
        ignore_columns.remove(feat)

    os.environ["CUDA_VISIBLE_DEVICES"] = gpu_id

    model = dm.MatchingModel(attr_summarizer=nn_type,
                             attr_comparator=comp_type)
    model.load_state(model_path)

    candidate = dm.data.process_unlabeled(path=test_path,
                                          trained_model=model,
                                          ignore_columns=ignore_columns)

    predictions = model.run_prediction(candidate,
                                       output_attributes=True,
                                       batch_size=8)

    predictions['pred'] = predictions['match_score'].apply(
        lambda score: 1 if score >= 0.5 else 0)

    print(
        classification_report(predictions['label'],
                              predictions['pred'],
                              digits=4))
    print(confusion_matrix(predictions['label'], predictions['pred']))

    file_name = os.path.basename(model_path) + os.path.basename(test_path)
    file_name = file_name.replace('model.pth', '_')
    file_name = file_name.replace('.csv', '.csv.gz')
    file_name = file_name.replace('_formatted', '')

    predictions.to_csv(out_path + file_name,
                       compression='gzip',
                       header=True,
                       index=False)
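A hypothetical invocation of the function above; every path, the GPU id, and the feature list are placeholders, not values from the original:

predict_and_write_for_inspection(
    test_path='../../../data/processed/test.csv',                          # placeholder
    model_path='../../../cache/deepmatcher/exp1/models/hybrid_model.pth',  # placeholder
    experiment_name='exp1',
    gpu_id='0',
    nn_type='hybrid',
    comp_type='abs-diff',
    features=['title', 'brand', 'price'])                                  # placeholder attributes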
Example #3
def dm_train(df):

    split_path = config.PATHS["deepmatcher_training_folder"]

    # Split the labeled data into train, valid, and test CSV files on disk,
    # with a 3:1:1 split ratio.
    dm.data.split(df, split_path, 'train.csv', 'valid.csv', 'test.csv',
                  [3, 1, 1])

    train, validation, test = dm.data.process(
        path=split_path,
        cache='train_cache.pth',
        train='train.csv',
        validation='valid.csv',
        test='test.csv',
        left_prefix='ltable',
        right_prefix='rtable',
        label_attr='label',
        id_attr='_id',
        ignore_columns=('ltable_id', 'rtable_id'))

    # Create a hybrid model.
    model = dm.MatchingModel(attr_summarizer='hybrid')

    # Train the hybrid model for 3 epochs with a batch size of 16 and a
    # positive-to-negative ratio of 10. The best model (highest F1 score on
    # the validation set) is saved to config.PATHS['deepmatcher_model'].
    model.run_train(
        train,
        validation,
        epochs=3,
        batch_size=16,
        best_save_path=config.PATHS['deepmatcher_model'],
        pos_neg_ratio=10)


    # Evaluate the accuracy on the test data.
    print(model.run_eval(test))
    return model
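A usage sketch, assuming config.PATHS is set up as above and df is a labeled candidate-pair table with the Magellan-style columns the call expects ('_id', 'label', and 'ltable_*'/'rtable_*' attributes); the file path is a placeholder:

import pandas as pd

df = pd.read_csv('labeled_pairs.csv')  # placeholder path
model = dm_train(df)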
Example #4
                        # [snippet truncated above: the opening of this dm.data.process(...)
                        #  call, including its path argument, is cut off]
                        cache=None,
                        #check_cached_data=False,
                        embeddings='fasttext.wiki.vec',
                        embeddings_cache_path=embedding_cache_dir,
                        train='train.csv', validation='validation.csv', test='test.csv')

    # parameters to keep consistent with
    nn_type = 'hybrid'
    comp_type = 'abs-diff'
    #epochs = 15
    pos_neg_ratio = 1
    batch_size = 8
    lr = 0.001
    lr_decay = 0.9
    smoothing = 0.05
    model = dm.MatchingModel(attr_summarizer=nn_type, attr_comparator=comp_type)
    model.initialize(train)
    optim = dm.optim.Optimizer(method='adam', lr=lr, max_grad_norm=5, start_decay_at=1, beta1=0.9, beta2=0.999, adagrad_accum=0.0, lr_decay=lr_decay)
    optim.set_parameters(model.named_parameters())
    start = time.time()

    # Epochs can be overridden from the command line (argv[5]); default to 30.
    # Note: the original guard (len(sys.argv) > 4) would raise an IndexError
    # when exactly five arguments are present, so it is corrected here.
    if len(sys.argv) > 5:
        ep = int(sys.argv[5])
    else:
        ep = 30
    model.run_train(
        train,
        validation,
        epochs=ep,
        batch_size=batch_size,
        pos_neg_ratio=pos_neg_ratio,
        # the call was truncated here; a plausible completion using the
        # otherwise-unused locals defined above:
        best_save_path=None,
        optimizer=optim,
        label_smoothing=smoothing)
Example #5
def run_dm_model(train_set,
                 valid_set,
                 test_set,
                 experiment_name,
                 gpu_id,
                 epochs,
                 pos_neg_ratio,
                 batch_size,
                 lr,
                 lr_decay,
                 embedding,
                 nn_type,
                 comp_type,
                 special_name,
                 features,
                 run_no,
                 smoothing=0.05):

    os.makedirs(os.path.dirname(
        '../../../reports/deepmatcher/raw/{}/'.format(experiment_name)),
                exist_ok=True)
    os.makedirs(os.path.dirname(
        '../../../cache/deepmatcher/{}/data-cache/'.format(experiment_name)),
                exist_ok=True)
    os.makedirs(os.path.dirname(
        '../../../cache/deepmatcher/{}/models/'.format(experiment_name)),
                exist_ok=True)

    os.environ["CUDA_VISIBLE_DEVICES"] = gpu_id

    dm.data.reset_vector_cache()

    ignore_columns = get_features(train_set)
    left_right_features = ['ltable_' + feat for feat in features]
    left_right_features.extend(['rtable_' + feat for feat in features])
    for feat in left_right_features:
        ignore_columns.remove(feat)

    features_filename = '-'.join(features)
    train_set_filename = os.path.basename(train_set)
    train_set_filename = train_set_filename.replace('.csv', '')

    train, valid, test = dm.data.process(
        path='',
        cache='../../../cache/deepmatcher/{}/data-cache/{}.pth'.format(
            experiment_name, train_set_filename + '_' + embedding),
        train=train_set,
        validation=valid_set,
        test=test_set,
        embeddings=embedding,
        use_magellan_convention=True,
        ignore_columns=ignore_columns)

    old_stdout = sys.stdout

    sys.stdout = open(
        '../../../reports/deepmatcher/raw/{}/{}_{}_{}_epochs{}_ratio{}_batch{}_lr{}_lrdecay{}_{}_{}_{}_run{}.txt'
        .format(experiment_name, nn_type, comp_type, special_name, epochs,
                pos_neg_ratio, batch_size, lr, lr_decay, embedding,
                features_filename, train_set_filename, run_no), 'w')
    model = dm.MatchingModel(attr_summarizer=nn_type,
                             attr_comparator=comp_type)
    model.initialize(train)

    optim = dm.optim.Optimizer(method='adam',
                               lr=lr,
                               max_grad_norm=5,
                               start_decay_at=1,
                               beta1=0.9,
                               beta2=0.999,
                               adagrad_accum=0.0,
                               lr_decay=lr_decay)
    optim.set_parameters(model.named_parameters())

    start = time.time()
    model.run_train(
        train,
        valid,
        epochs=epochs,
        batch_size=batch_size,
        best_save_path=
        '../../../cache/deepmatcher/{}/models/{}_{}_{}_epochs{}_ratio{}_batch{}_lr{}_lrdecay{}_{}_{}_{}_run{}_model.pth'
        .format(experiment_name, nn_type, comp_type, special_name, epochs,
                pos_neg_ratio, batch_size, lr, lr_decay, embedding,
                features_filename, train_set_filename, run_no),
        pos_neg_ratio=pos_neg_ratio,
        optimizer=optim,
        label_smoothing=smoothing)
    end = time.time()
    print('Training time: ' + str(end - start))
    start = time.time()
    model.run_eval(test, batch_size=batch_size)
    end = time.time()
    print('Prediction time: ' + str(end - start))
    sys.stdout = old_stdout
Example #6
    # [snippet truncated above: 'neighborhood' is a DataFrame of candidate pairs with match scores]
    if len(neighborhood) > num_triangles:
        neighborhood = neighborhood.sample(n=num_triangles)
    neighborhood['id'] = neighborhood.index
    neighborhood['label'] = list(
        map(lambda score: int(round(score)),
            neighborhood.match_score.values))
    neighborhood = neighborhood.drop(['match_score'], axis=1)
    r1r2['label'] = np.argmax(originalPrediction)
    dataset4explanation = pd.concat([r1r2, neighborhood], ignore_index=True)
    return dataset4explanation


lsource = pd.read_csv('datasets/Structured/DBLP-ACM/tableA.csv')
rsource = pd.read_csv('datasets/Structured/DBLP-ACM/tableB.csv')

model = dm.MatchingModel(attr_summarizer='hybrid')
model.load_state('da_dm.pth')


def predict_fn(test_df,
               model,
               ignore_columns=['label'],
               outputAttributes=False,
               batch_size=32):
    data = test_df.copy().drop(
        [c for c in ignore_columns if c in test_df.columns], axis=1)
    if 'id' not in data.columns:
        data['id'] = np.arange(len(data))
    tmp_name = "./{}.csv".format("".join(
        [random.choice(string.ascii_lowercase) for _ in range(10)]))
    data.to_csv(tmp_name, index=False)
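The snippet is cut off after writing the temporary CSV. A plausible continuation (a sketch, not the original code) would feed that file to dm.data.process_unlabeled, score it, and delete the temp file:

    # Sketch of a plausible continuation, not from the original snippet:
    candidate = dm.data.process_unlabeled(path=tmp_name, trained_model=model)
    predictions = model.run_prediction(candidate,
                                       output_attributes=outputAttributes,
                                       batch_size=batch_size)
    os.remove(tmp_name)  # clean up the temp file once scored
    return predictions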
Example #7
import numpy as np
np.random.seed(42)
import random
random.seed(42)

if __name__ == "__main__":
    data_dir = "/home/zz/Work/data/deepmatcher_toy/sample_data/itunes-amazon"

    train, validation, test = \
        dm.data.process(path=data_dir,
                        check_cached_data=False,
                        embeddings='fasttext.wiki.vec',
                        embeddings_cache_path=data_dir+"/embedding_cache",
                        train='train.csv', validation='validation.csv', test='test.csv')

    model = dm.MatchingModel()
    model.run_train(train, validation, best_save_path=None)
    model.run_eval(test)

    # unlabeled = dm.data.process_unlabeled(path='data_directory/unlabeled.csv', trained_model=model)
    # model.run_prediction(unlabeled)
'''
    This method is important for reading/caching embeddings.

    The param 'embeddings' is only a name identifying the embeddings. DM only supports a limited set
    of these names, and when a recognisable name is supplied, it will attempt to download the
    corresponding vectors. This means that to use a custom embedding, you need to 'hack it': rename
    your model file to one of the expected names and keep the same format. As an example, when
    embeddings='fasttext.wiki.vec', DM will look for:
    https://dl.fbaipublicfiles.com/fasttext/vectors-english/wiki-news-300d-1M.vec.zip

    You can pre-download this and save it into [embeddings_cache_path] (see the code below).
'''
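A sketch of that pre-download step, reusing the cache directory from the example above; the exact file name DM expects inside the cache is an assumption here:

import os
import urllib.request

cache_dir = data_dir + "/embedding_cache"  # as in the example above
os.makedirs(cache_dir, exist_ok=True)
url = ('https://dl.fbaipublicfiles.com/fasttext/vectors-english/'
       'wiki-news-300d-1M.vec.zip')
target = os.path.join(cache_dir, 'wiki-news-300d-1M.vec.zip')  # assumed file name
if not os.path.exists(target):
    urllib.request.urlretrieve(url, target)
# Then process as above with embeddings='fasttext.wiki.vec' and
# embeddings_cache_path=cache_dir.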
Example #8
def create_model():
    model = dm.MatchingModel(attr_summarizer='hybrid')
    return model
def run_dm_model(train_set,
                 valid_set,
                 test_set,
                 experiment_name,
                 gpu_id,
                 epochs,
                 pos_neg_ratio,
                 batch_size,
                 lr,
                 lr_decay,
                 embedding,
                 nn_type,
                 comp_type,
                 special_name,
                 features,
                 run_no,
                 smoothing=0.05,
                 prediction_sets=None):

    os.makedirs(os.path.dirname(
        '../../../reports/deepmatcher/raw/{}/'.format(experiment_name)),
                exist_ok=True)
    os.makedirs(os.path.dirname(
        '../../../cache/deepmatcher/{}/data-cache/'.format(experiment_name)),
                exist_ok=True)
    os.makedirs(os.path.dirname(
        '../../../cache/deepmatcher/{}/models/'.format(experiment_name)),
                exist_ok=True)

    os.environ["CUDA_VISIBLE_DEVICES"] = gpu_id

    dm.data.reset_vector_cache()

    ignore_columns = get_features(train_set)
    left_right_features = ['ltable_' + feat for feat in features]
    left_right_features.extend(['rtable_' + feat for feat in features])
    for feat in left_right_features:
        ignore_columns.remove(feat)

    features_filename = '-'.join(features)
    train_set_filename = os.path.basename(train_set)
    train_set_filename = train_set_filename.replace('.csv', '')
    train, valid, test = dm.data.process(
        path='',
        cache='../../../cache/deepmatcher/{}/data-cache/{}.pth'.format(
            experiment_name, train_set_filename + '_' + embedding),
        train=train_set,
        validation=valid_set,
        test=test_set,
        embeddings=embedding,
        use_magellan_convention=True,
        ignore_columns=ignore_columns)

    old_stdout = sys.stdout

    sys.stdout = open(
        '../../../reports/deepmatcher/raw/{}/{}_{}_{}_epochs{}_ratio{}_batch{}_lr{}_lrdecay{}_{}_{}_{}_run{}.txt'
        .format(experiment_name, nn_type, comp_type, special_name, epochs,
                pos_neg_ratio, batch_size, lr, lr_decay, embedding,
                features_filename, train_set_filename, run_no), 'w')
    model = dm.MatchingModel(attr_summarizer=nn_type,
                             attr_comparator=comp_type)
    model.initialize(train)

    optim = dm.optim.Optimizer(method='adam',
                               lr=lr,
                               max_grad_norm=5,
                               start_decay_at=1,
                               beta1=0.9,
                               beta2=0.999,
                               adagrad_accum=0.0,
                               lr_decay=lr_decay)
    optim.set_parameters(model.named_parameters())

    start = time.time()
    model.run_train(
        train,
        valid,
        epochs=epochs,
        batch_size=batch_size,
        best_save_path=
        '../../../cache/deepmatcher/{}/models/{}_{}_{}_epochs{}_ratio{}_batch{}_lr{}_lrdecay{}_{}_{}_{}_run{}_model.pth'
        .format(experiment_name, nn_type, comp_type, special_name, epochs,
                pos_neg_ratio, batch_size, lr, lr_decay, embedding,
                features_filename, train_set_filename, run_no),
        pos_neg_ratio=pos_neg_ratio,
        optimizer=optim,
        label_smoothing=smoothing)
    end = time.time()
    print('Training time: ' + str(end - start))
    start = time.time()
    model.run_eval(test, batch_size=batch_size)
    end = time.time()
    print('Prediction time: ' + str(end - start))
    sys.stdout = old_stdout

    if prediction_sets is not None:
        out_path = '../../../data/processed/inspection/{}/deepmatcher/'.format(
            experiment_name)
        os.makedirs(os.path.dirname(out_path), exist_ok=True)
        for prediction_set in prediction_sets:
            candidate = dm.data.process_unlabeled(
                path=prediction_set,
                trained_model=model,
                ignore_columns=ignore_columns)
            predictions = model.run_prediction(candidate,
                                               output_attributes=True,
                                               batch_size=8)

            predictions['label_pred'] = predictions['match_score'].apply(
                lambda score: 1 if score >= 0.5 else 0)

            file_name = os.path.basename(
                '{}_{}_{}_epochs{}_ratio{}_batch{}_lr{}_lrdecay{}_{}_{}_{}_run{}_model.pth'
                .format(nn_type, comp_type, special_name, epochs,
                        pos_neg_ratio, batch_size, lr, lr_decay, embedding,
                        features_filename, train_set_filename,
                        run_no)) + os.path.basename(prediction_set)
            file_name = file_name.replace('.csv', '.csv.gz')
            file_name = file_name.replace('model.pth', '')
            file_name = file_name.replace('_formatted', '')

            predictions.to_csv(out_path + file_name,
                               compression='gzip',
                               header=True,
                               index=False)
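A hypothetical invocation of run_dm_model with prediction sets enabled; every path and hyperparameter below is a placeholder, not a value from the original:

run_dm_model(train_set='../../../data/processed/train.csv',    # placeholder
             valid_set='../../../data/processed/valid.csv',    # placeholder
             test_set='../../../data/processed/test.csv',      # placeholder
             experiment_name='exp1',
             gpu_id='0',
             epochs=15,
             pos_neg_ratio=1,
             batch_size=16,
             lr=0.001,
             lr_decay=0.9,
             embedding='fasttext.en.bin',
             nn_type='hybrid',
             comp_type='abs-diff',
             special_name='baseline',
             features=['title', 'brand'],                      # placeholder attributes
             run_no=1,
             prediction_sets=['../../../data/processed/test.csv'])  # placeholder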