def setUp(self):
    """Create the pickle-backed Yelp ``LocalData`` fixture for each test."""
    # Directory two levels above the folder containing this module.
    base_dir = os.path.abspath(
        os.path.dirname(__file__) + os.path.sep + "../../"
    )
    pkl_path = base_dir + '/example/yelp_3000.pkl'

    # Positional arguments after the file type: unique-id expression,
    # query fields, match fields (order as used by the keyword calls
    # elsewhere in this project -- confirm against LocalData).
    id_expr = "row['business_id']"
    query_exprs = ["row['name']"]
    match_exprs = ["row['name']", "row['full_address']"]
    self.localdata = LocalData(pkl_path, 'pkl', id_expr, query_exprs,
                               match_exprs)
def test_sota_sampler(self):
    """Run ``sampler.sota_sampler`` once against ``self.yelp`` as a smoke test.

    The test passes when the sampler finishes without raising. The API
    session is closed in a ``finally`` clause so that it is released even
    when loading the local data or sampling fails.
    """
    try:
        base_dir = os.path.abspath(
            os.path.dirname(__file__) + os.path.sep + "../../"
        )
        local_file = base_dir + '/pkl_example/yelp_3000.pkl'
        localdata = LocalData(local_file, 'pkl', "business_id", ["name"],
                              ["name", "full_address"])
        # Only the second returned value is needed to build the query pool.
        _ids, localdata_query, _er = localdata.getlocalData()
        initQueries = utils.queryGene(localdata_query, 2)
        sampler.sota_sampler(query_pool=initQueries, api=self.yelp,
                             match_term=localdata.getQueryList(), top_k=300,
                             adjustment=1, samplenum=1)
    finally:
        # Always release the HTTP session owned by the API fixture.
        self.yelp.getSession().close()
def setUp(self):
    """Build the DBLP API client and the initial query pool for each test."""
    api_kwargs = {'h': 1000}
    self.dblp = PublApi(delay=5, search_term='q', **api_kwargs)

    # Directory two levels above the folder containing this module.
    base_dir = os.path.abspath(
        os.path.dirname(__file__) + os.path.sep + "../../"
    )
    pkl_path = base_dir + '/example/dblp_10000.pkl'
    local = LocalData(pkl_path, 'pkl', "row['key']", ["row['title']"],
                      ["row['title']"])

    # getlocalData() yields three values; only the second feeds queryGene.
    _ids, queries, _er = local.getlocalData()
    self.initQueries = utils.queryGene(queries, 2)
class LocaldataTestCase(unittest.TestCase):
    """Checks that ``LocalData`` loads the Yelp CSV example file."""

    def setUp(self):
        """Point a fresh ``LocalData`` at the Yelp CSV example."""
        base_dir = os.path.abspath(
            os.path.dirname(__file__) + os.path.sep + "../../"
        )
        csv_path = base_dir + '/yelp_example/yelp_3000_AZ.csv'
        self.localdata = LocalData(csv_path, 'csv', "business_id", ["name"],
                                   ["name", "full_address"])

    def tearDown(self):
        """Drop the fixture reference after each test."""
        self.localdata = None

    def test_loadLocalData(self):
        # Clear whatever was loaded, then re-read the CSV explicitly.
        self.localdata.setlocalData(None, None, None)
        self.localdata.read_csv()
        ids, _queries, _er = self.localdata.getlocalData()
        assert len(ids) == 3000
def setUp(self):
    """Prepare the DBLP client and the query pool built from the CSV sample."""
    api_kwargs = {'h': 1000}
    self.dblp = PublApi(top_k=1000, delay=5, search_term='q', **api_kwargs)

    # Directory two levels above the folder containing this module.
    base_dir = os.path.abspath(
        os.path.dirname(__file__) + os.path.sep + "../../"
    )
    csv_path = base_dir + '/csv_example/dblp_sample.csv'
    local = LocalData(csv_path, 'csv', "key", ["title"], ["title"])

    # Three values come back; only the second is passed to queryGene.
    _, queries, _ = local.getlocalData()
    self.initQueries = utils.queryGene(queries, 2)
class LocaldataTestCase(unittest.TestCase):
    """Checks that ``LocalData`` loads the Yelp pickle example file."""

    def setUp(self):
        """Point a fresh ``LocalData`` at the Yelp pickle example."""
        base_dir = os.path.abspath(
            os.path.dirname(__file__) + os.path.sep + "../../"
        )
        pkl_path = base_dir + '/example/yelp_3000.pkl'
        match_exprs = ["row['name']", "row['full_address']"]
        self.localdata = LocalData(pkl_path, 'pkl', "row['business_id']",
                                   ["row['name']"], match_exprs)

    def tearDown(self):
        """Drop the fixture reference after each test."""
        self.localdata = None

    def test_loadLocalData(self):
        # Clear whatever was loaded, then re-read the pickle explicitly.
        self.localdata.setlocalData(None, None, None)
        self.localdata.read_pickle()
        ids, _queries, _er = self.localdata.getlocalData()
        assert len(ids) == 3000
result_file.csv match_file.pkl match_file.csv enriched_file.pkl enriched_file.csv """
# Input file names and the output directory used by this example.
sample_file = 'dblp_sample.csv'
localdata_file = 'dblp_3881.csv'
result_dir = 'dblp_result'
# Describe the sample data, the local data and the hidden (remote) data:
# each gets a unique-id field plus the fields used for querying/matching.
sampledata = SampleData(sample_ratio=0.5, samplepath=sample_file, filetype='csv', uniqueid="key", querylist=["title"])
localdata = LocalData(localpath=localdata_file, filetype='csv', uniqueid="ID", querylist=['title'], matchlist=['title'])
hiddendata = HiddenData(result_dir=result_dir, uniqueid="info.key", matchlist=["info.title"])
# Budget passed as the first argument of smartCrawl.
budget = 20
# `dblp` is the API client created earlier, outside this excerpt.
smartcrawl.smartCrawl(budget, dblp, sampledata, localdata, hiddendata)
# The string below is a disabled alternative call that additionally passes
# pool_thre, jaccard_thre and threads; as a bare string it is never executed.
""" pool_thre = 2 jaccard_thre = 0.85 threads = 4 smartcrawl.smartCrawl(budget, dblp, sampledata, localdata, hiddendata, pool_thre, jaccard_thre, threads) """
# Yelp API client; client_id, client_secret, search_term and parameters are
# defined earlier in the script.
yelp = SearchApi(client_id=client_id, client_secret=client_secret, top_k=300,
                 delay=5, search_term=search_term, **parameters)

# Files referenced by this example. The first two are the inputs assigned
# below; the files under yelp_result are presumably written by the crawl
# (confirm against smartcrawl). Written as comments because the former bare
# string literal contained the invalid escape sequence "\y".
#   yelp_sample_AZ.pkl
#   yelp_3000_AZ.csv
#   yelp_result/
#       result_file.pkl
#       result_file.csv
#       match_file.pkl
#       match_file.csv
#       enriched_file.pkl
#       enriched_file.csv
sample_file = 'yelp_sample_AZ.pkl'
localdata_file = 'yelp_3000_AZ.csv'
result_dir = 'yelp_result'

# Describe the sample data, the local data and the hidden (remote) data:
# each gets a unique-id field plus the fields used for querying/matching.
sampledata = SampleData(sample_ratio=0.5, samplepath=sample_file,
                        filetype='pkl', uniqueid="business_id",
                        querylist=["name"])
localdata = LocalData(localpath=localdata_file, filetype='csv',
                      uniqueid="business_id", querylist=["name"],
                      matchlist=["name", "full_address"])
hiddendata = HiddenData(result_dir=result_dir, uniqueid="id",
                        matchlist=["name", "location.display_address.*"])

# Budget passed as the first argument of smartCrawl.
budget = 20
smartcrawl.smartCrawl(budget, yelp, sampledata, localdata, hiddendata)

# Disabled alternative call that also passes the tuning arguments:
#   pool_thre = 2
#   jaccard_thre = 0.85
#   threads = 4
#   smartcrawl.smartCrawl(budget, yelp, sampledata, localdata, hiddendata,
#                         pool_thre, jaccard_thre, threads)
def setUp(self):
    """Create the CSV-backed Yelp ``LocalData`` fixture for each test."""
    # Directory two levels above the folder containing this module.
    module_dir = os.path.dirname(__file__)
    base_dir = os.path.abspath(module_dir + os.path.sep + "../../")
    csv_path = base_dir + '/yelp_example/yelp_3000_AZ.csv'

    query_fields = ["name"]
    match_fields = ["name", "full_address"]
    self.localdata = LocalData(csv_path, 'csv', "business_id", query_fields,
                               match_fields)
import os

from deeperlib.api.yelp.searchapi import SearchApi
from deeperlib.core import utils
from deeperlib.data_processing.local_data import LocalData
from deeperlib.estimator import sampler

# ==== Sota-Sampler Yelp ====
# NOTE(review): API credentials are embedded in source. They can now be
# overridden through YELP_CLIENT_ID / YELP_CLIENT_SECRET; the embedded values
# remain only as a backward-compatible fallback and should be rotated and
# removed from version control.
client_id = os.environ.get("YELP_CLIENT_ID", "kCe2YbZePXsPnC204ZrXoQ")
client_secret = os.environ.get(
    "YELP_CLIENT_SECRET",
    "s9KnvEEQW7jaA2wlrBi4X2fnDQ0F7asdklXVvJUidWp8i50ov24E8EjkHX2AUhoL",
)
search_term = 'term'
parameters = {'limit': 50, 'location': 'AZ'}
yelp = SearchApi(client_id=client_id, client_secret=client_secret, top_k=300,
                 delay=5, search_term=search_term, **parameters)

try:
    # Load the local pickle and derive the initial query pool from it.
    local_file = 'yelp_3000.pkl'
    localdata = LocalData(local_file, 'pkl', "row['business_id']",
                          ["row['name']"],
                          ["row['name']", "row['full_address']"])
    localdata_ids, localdata_query, localdata_er = localdata.getlocalData()
    initQueries = utils.queryGene(localdata_query, 2)
    sampler.sota_sampler(query_pool=initQueries, api=yelp,
                         match_term=localdata.getQueryList(), top_k=300,
                         adjustment=1)
finally:
    # Release the HTTP session even when loading or sampling fails.
    yelp.getSession().close()
import os

from deeperlib.api.yelp.searchapi import SearchApi
from deeperlib.core import utils
from deeperlib.data_processing.local_data import LocalData
from deeperlib.estimator import sampler

# ==== Sota-Sampler Yelp ====
# NOTE(review): API credentials are embedded in source. They can now be
# overridden through YELP_CLIENT_ID / YELP_CLIENT_SECRET; the embedded values
# remain only as a backward-compatible fallback and should be rotated and
# removed from version control.
client_id = os.environ.get("YELP_CLIENT_ID", "kCe2YbZePXsPnC204ZrXoQ")
client_secret = os.environ.get(
    "YELP_CLIENT_SECRET",
    "s9KnvEEQW7jaA2wlrBi4X2fnDQ0F7asdklXVvJUidWp8i50ov24E8EjkHX2AUhoL",
)
search_term = 'term'
parameters = {'limit': 50, 'location': 'AZ'}
yelp = SearchApi(client_id=client_id, client_secret=client_secret, top_k=300,
                 delay=5, search_term=search_term, **parameters)

try:
    # Load the local pickle and derive the initial query pool from it.
    local_file = 'yelp_3000.pkl'
    localdata = LocalData(local_file, 'pkl', "business_id", ["name"],
                          ["name", "full_address"])
    localdata_ids, localdata_query, localdata_er = localdata.getlocalData()
    initQueries = utils.queryGene(localdata_query, 2)
    sampler.sota_sampler(query_pool=initQueries, api=yelp,
                         match_term=localdata.getQueryList(), top_k=300,
                         adjustment=1)
finally:
    # Release the HTTP session even when loading or sampling fails.
    yelp.getSession().close()
from deeperlib.api.dblp.publapi import PublApi
from deeperlib.core import utils
from deeperlib.data_processing.local_data import LocalData
from deeperlib.estimator import aggregation

# ==== Sota-Estimator Dblp ====
search_term = 'q'
parameters = {'h': 1000}
dblp = PublApi(delay=5, search_term=search_term, **parameters)
try:
    # NOTE(review): other examples in this project load 'dblp_10000.pkl';
    # this name has no extension -- confirm the intended file.
    localdata_file = 'dblp_10000'
    localdata = LocalData(localdata_file, 'pkl', "row['key']",
                          ["row['title']"], ["row['title']"])
    localdata_ids, localdata_query, localdata_er = localdata.getlocalData()
    initQueries = utils.queryGene(localdata_query, 2)
    aggregation.sota_estimator(query_pool=initQueries, api=dblp,
                               match_term=["row['info']['title']"],
                               uniqueid="row['info']['key']", query_num=1)
finally:
    # Close this client's session before `dblp` is rebound below; previously
    # the first session was never closed.
    dblp.getSession().close()

# ==== Stratified-Estimator Dblp ====
# A fresh client is created for the second estimator, as in the original.
dblp = PublApi(delay=5, search_term=search_term, **parameters)
try:
    aggregation.stratified_estimator(query_pool=initQueries, api=dblp,
                                     match_term=["row['info']['title']"],
                                     candidate_rate=0.2, query_num=100)
finally:
    dblp.getSession().close()
from deeperlib.api.dblp.publapi import PublApi
from deeperlib.core import utils
from deeperlib.data_processing.local_data import LocalData
from deeperlib.estimator import aggregation

# ==== Sota-Estimator Dblp ====
# One API client is shared by both estimators below.
search_term = 'q'
parameters = {'h': 1000}
dblp = PublApi(
    top_k=1000,
    delay=5,
    search_term=search_term,
    **parameters
)

# Load the CSV sample and derive the initial query pool from it.
localdata_file = 'dblp_sample.csv'
localdata = LocalData(localdata_file, 'csv', "key", ["title"], ["title"])
localdata_ids, localdata_query, localdata_er = localdata.getlocalData()
initQueries = utils.queryGene(localdata_query, 2)

aggregation.sota_estimator(
    query_pool=initQueries,
    api=dblp,
    match_term=["info.title"],
    uniqueid="info.key",
    query_num=1,
)

# ==== Stratified-Estimator Dblp ====
aggregation.stratified_estimator(
    query_pool=initQueries,
    api=dblp,
    match_term=["info.title"],
    candidate_rate=0.2,
    query_num=100,
)

# Release the HTTP session once both estimators have run.
dblp.getSession().close()
format The result format of the search. Recognized values are "xml", xml ...?q=test&format=json "json", and "jsonp". h Maximum number of search results (hits) to return. For bandwidth 30 ...?q=test&h=100 reasons, this number is capped at 1000. f The first hit in the numbered sequence of search results (starting with 0) to return. In combination with the h parameter, 0 ...?q=test&h=100&f=300 this parameter can be used for pagination of search results. c Maximum number of completion terms (see below) to return. For 10 ...?q=test&c=0 bandwidth reasons, this number is capped at 1000. """
# 'q' is the query-string parameter; 'h' is the maximum number of hits,
# per the parameter table in the string above.
search_term = 'q'
parameters = {'h': 1000}
dblp = PublApi(delay=5, search_term=search_term, **parameters)
# NOTE(review): the bare string below is a file listing, never executed. It
# contains "\d", which is not a recognized escape sequence -- consider a raw
# string or plain comments.
""" \dblp_sample.pkl dblp_10000.pkl dblp_result\\result_file.pkl result_file.csv match_file.pkl match_file.csv """
# Input file names and the output directory used by this example.
sample_file = 'dblp_sample.pkl'
localdata_file = 'dblp_10000.pkl'
result_dir = 'dblp_result'
# Describe the sample data, the local data and the hidden (remote) data:
# each gets a unique-id expression plus the expressions used for
# querying/matching.
sampledata = SampleData(samplepath=sample_file, filetype='pkl', uniqueid="row['key']", querylist=["row['title']"])
localdata = LocalData(localpath=localdata_file, filetype='pkl', uniqueid="row['key']", querylist=["row['title']"], matchlist=["row['title']"])
hiddendata = HiddenData(result_dir=result_dir, uniqueid="row['info']['key']", matchlist=["row['info']['title']"])
# top_k, count, pool_thre, jaccard_thre, threads and budget are not assigned
# in this excerpt; they must be defined earlier in the script.
smartcrawl.smartCrawl(top_k, count, pool_thre, jaccard_thre, threads, budget, dblp, sampledata, localdata, hiddendata)
yelp_result\\result_file.pkl result_file.csv match_file.pkl match_file.csv """
# Input sample file, local pickle name and the output directory.
sample_file = 'yelp_sample.pkl'
localdata_file = 'yelp_3000.pkl'
result_dir = 'yelp_result'
sampledata = SampleData(samplepath=sample_file, filetype='pkl', uniqueid="row['id']", querylist=["row['name']"])
# The string below is a disabled pickle-based configuration (it would use
# localdata_file); as a bare string it is never executed.
""" localdata = LocalData(localpath=localdata_file, filetype='pkl', uniqueid="row['business_id']", querylist=["row['name']"], matchlist=["row['name']", "row['full_address']"]) hiddendata = HiddenData(result_dir=result_dir, uniqueid="row['id']", matchlist=["row['name']", "' '.join(row['location']['display_address'])"]) """
# Active configuration: local data comes from a CSV file with ID/NAME/ADDRESS
# fields; hidden data matches on the name and the joined display address.
localdata = LocalData(localpath='yelp_5882.csv', filetype='csv', uniqueid='ID', querylist=['NAME'], matchlist=['NAME', 'ADDRESS'])
hiddendata = HiddenData( result_dir=result_dir, uniqueid="row['id']", matchlist=["row['name']", "' '.join(row['location']['display_address'])"])
# top_k, count, pool_thre, jaccard_thre, threads, budget and yelp are not
# assigned in this excerpt; they must be defined earlier in the script.
smartcrawl.smartCrawl(top_k, count, pool_thre, jaccard_thre, threads, budget, yelp, sampledata, localdata, hiddendata)