sys.path.append('/Users/pradap/Documents/Research/Python-Package/scaling/dmagellan')

from dmagellan.feature.extractfeatures import extract_feature_vecs
from dmagellan.feature.autofeaturegen import get_features_for_matching

from dask import multiprocessing, threaded
from dask.diagnostics import ProgressBar, Profiler, ResourceProfiler, CacheProfiler, visualize
import cloudpickle
# Profile dmagellan's feature-vector extraction on the 300k citeseer/DBLP
# sample with dask's threaded scheduler and write the result as an HTML report.

# Output path of the HTML profile report written by visualize() at the end.
filename = './profres_exp_mt_dblp_300k_extractfeatvecs.html'

# Register a progress bar globally so the dask computation below reports progress.
pbar = ProgressBar()
pbar.register()

#print("Mem. usage before reading:{0}".format( psutil.virtual_memory().used/1e9))
A = pd.read_csv('./datasets/sample_citeseer_300k.csv')
B = pd.read_csv('./datasets/sample_dblp_300k.csv')
#print("Mem. usage after reading:{0}".format(psutil.virtual_memory().used/1e9))

# Pairs to featurize; presumably keyed by '_id' with 'l_id'/'r_id' pointing
# into A and B (column names taken from the call below) -- TODO confirm.
C = pd.read_csv('./datasets/candset.csv')

feature_table = get_features_for_matching(A, B)

# compute=False keeps the extraction lazy so that the actual work happens
# inside the profiler context below and is fully captured by it.
feature_vecs = extract_feature_vecs(
    C, A, B,
    '_id', 'l_id', 'r_id', 'id', 'id',
    feature_table=feature_table,
    nchunks=4,
    compute=False,
)

with Profiler() as prof, CacheProfiler() as cprof, ResourceProfiler(dt=0.25) as rprof:
    # `get=threaded.get` was deprecated in dask 0.18 and later removed;
    # `scheduler='threads'` selects the same threaded scheduler.
    D = feature_vecs.compute(scheduler='threads', num_workers=4)


# Render task, cache and resource profiles into one HTML file without
# opening a browser window.
visualize([prof, cprof, rprof], file_path=filename, show=False)
Beispiel #2
0
                         rtable=B,
                         fk_ltable='ltable_id',
                         fk_rtable='rtable_id')
# Split S into I and J (half/half, fixed seed for reproducibility).
IJ = em.split_train_test(S, train_proportion=0.5, random_state=0)
I, J = IJ['train'], IJ['test']

# Create a set of ML-matchers.
dt = DTMatcher(name='DecisionTree', random_state=0)
svm = SVMMatcher(name='SVM', random_state=0)
rf = RFMatcher(name='RF', random_state=0)
lg = LogRegMatcher(name='LogReg', random_state=0)
nb = NBMatcher(name='NaiveBayes')
ln = LinRegMatcher(name='LinearRegression')

# Generate the feature table from A and B, then extract feature vectors for
# the training split I, carrying the 'label' column along in the output.
F = get_features_for_matching(A, B)
H = extract_feature_vecs(
    I, A, B,
    '_id', 'ltable_id', 'rtable_id', 'id', 'id',
    feature_table=F,
    attrs_after='label',
    nchunks=4,
    compute=True,
    scheduler=get,
)
print(len(H))