Beispiel #1
0
 def setUp(self):
     """Create the ``LocalData`` fixture backed by the pickled Yelp sample."""
     # Resolve the directory two levels above this test module first, then
     # append the example file's location to it.
     base_dir = os.path.abspath(os.path.dirname(__file__) + os.path.sep + "../../")
     pickle_path = base_dir + '/example/yelp_3000.pkl'
     name_expr = "row['name']"
     self.localdata = LocalData(
         pickle_path,
         'pkl',
         "row['business_id']",
         [name_expr],
         [name_expr, "row['full_address']"],
     )
Beispiel #2
0
 def test_sota_sampler(self):
     # Smoke test: run the sampler once against self.yelp with queries
     # generated from the pickled Yelp example. It passes when nothing raises.
     try:
         local_file = os.path.abspath(os.path.dirname(__file__) + os.path.sep + "../../") + '/pkl_example/yelp_3000.pkl'
         localdata = LocalData(local_file, 'pkl', "business_id", ["name"], ["name", "full_address"])
         # Only the query part of the loaded triple is used below.
         _, localdata_query, _ = localdata.getlocalData()
         initQueries = utils.queryGene(localdata_query, 2)
         sampler.sota_sampler(query_pool=initQueries, api=self.yelp, match_term=localdata.getQueryList(), top_k=300,
                              adjustment=1, samplenum=1)
     finally:
         # Close the API session even when loading or sampling fails, so a
         # failing run does not leave the session open.
         self.yelp.getSession().close()
Beispiel #3
0
 def setUp(self):
     """Create the DBLP API client and the initial query pool.

     Stores the client on ``self.dblp`` and the generated queries on
     ``self.initQueries``.
     """
     self.dblp = PublApi(delay=5, search_term='q', h=1000)

     # The pickled example lives two directory levels above this module.
     base_dir = os.path.abspath(os.path.dirname(__file__) + os.path.sep + "../../")
     title_expr = "row['title']"
     localdata = LocalData(base_dir + '/example/dblp_10000.pkl', 'pkl',
                           "row['key']", [title_expr], [title_expr])

     # Only the query part of the loaded triple feeds the query generator.
     _, query_part, _ = localdata.getlocalData()
     self.initQueries = utils.queryGene(query_part, 2)
Beispiel #4
0
class LocaldataTestCase(unittest.TestCase):
    """Checks loading of the Yelp AZ CSV example through ``LocalData``."""

    def setUp(self):
        """Create a ``LocalData`` fixture pointing at the CSV example file."""
        # The example directory sits two levels above this test module.
        base_dir = os.path.abspath(os.path.dirname(__file__) + os.path.sep + "../../")
        csv_path = base_dir + '/yelp_example/yelp_3000_AZ.csv'
        self.localdata = LocalData(
            csv_path, 'csv', "business_id",
            ["name"], ["name", "full_address"])

    def tearDown(self):
        """Drop the fixture reference after each test."""
        self.localdata = None

    def test_loadLocalData(self):
        # Reset whatever the fixture currently holds, reload it from the CSV
        # file, and check the number of ids that come back.
        fixture = self.localdata
        fixture.setlocalData(None, None, None)
        fixture.read_csv()
        loaded_ids, _, _ = fixture.getlocalData()
        assert len(loaded_ids) == 3000
Beispiel #5
0
 def setUp(self):
     """Create the DBLP API client and the initial query pool.

     Stores the client on ``self.dblp`` and the generated queries on
     ``self.initQueries``.
     """
     self.dblp = PublApi(top_k=1000, delay=5, search_term='q', h=1000)

     # The CSV sample lives two directory levels above this module.
     base_dir = os.path.abspath(os.path.dirname(__file__) + os.path.sep + "../../")
     csv_path = base_dir + '/csv_example/dblp_sample.csv'
     localdata = LocalData(csv_path, 'csv', "key", ["title"], ["title"])

     # Only the query part of the loaded triple feeds the query generator.
     _, query_part, _ = localdata.getlocalData()
     self.initQueries = utils.queryGene(query_part, 2)
Beispiel #6
0
class LocaldataTestCase(unittest.TestCase):
    """Checks loading of the pickled Yelp example through ``LocalData``."""

    def setUp(self):
        """Create a ``LocalData`` fixture pointing at the pickle example file."""
        # The example directory sits two levels above this test module.
        base_dir = os.path.abspath(os.path.dirname(__file__) + os.path.sep + "../../")
        pickle_path = base_dir + '/example/yelp_3000.pkl'
        name_expr = "row['name']"
        self.localdata = LocalData(
            pickle_path, 'pkl', "row['business_id']",
            [name_expr], [name_expr, "row['full_address']"])

    def tearDown(self):
        """Drop the fixture reference after each test."""
        self.localdata = None

    def test_loadLocalData(self):
        # Reset whatever the fixture currently holds, reload it from the
        # pickle file, and check the number of ids that come back.
        fixture = self.localdata
        fixture.setlocalData(None, None, None)
        fixture.read_pickle()
        loaded_ids, _, _ = fixture.getlocalData()
        assert len(loaded_ids) == 3000
Beispiel #7
0
              result_file.csv
              match_file.pkl
              match_file.csv
              enriched_file.pkl
              enriched_file.csv
"""
# Input files, output directory and crawl budget for this run.
sample_file = 'dblp_sample.csv'
localdata_file = 'dblp_3881.csv'
result_dir = 'dblp_result'
budget = 20

# Describe the three data sets handed to smartCrawl.
sampledata = SampleData(
    sample_ratio=0.5, samplepath=sample_file, filetype='csv',
    uniqueid="key", querylist=["title"])
localdata = LocalData(
    localpath=localdata_file, filetype='csv',
    uniqueid="ID", querylist=['title'], matchlist=['title'])
hiddendata = HiddenData(
    result_dir=result_dir, uniqueid="info.key", matchlist=["info.title"])

# NOTE(review): `dblp` is not defined in this block; it is expected to be
# created earlier in the script.
smartcrawl.smartCrawl(budget, dblp, sampledata, localdata, hiddendata)
"""
pool_thre = 2
jaccard_thre = 0.85
threads = 4
smartcrawl.smartCrawl(budget, dblp, sampledata, localdata, hiddendata, pool_thre, jaccard_thre, threads)
"""
Beispiel #8
0
# NOTE(review): client_id, client_secret, search_term and parameters are not
# defined in this block; they are expected to be set earlier in the script.
yelp = SearchApi(client_id=client_id, client_secret=client_secret, top_k=300, delay=5, search_term=search_term,
                 **parameters)

# File layout used by this example. The literal is a raw string so that the
# backslashes in the listing are not parsed as (invalid) escape sequences.
r"""
\yelp_sample_AZ.pkl
 yelp_3000_AZ.csv
 yelp_result\\result_file.pkl
              result_file.csv
              match_file.pkl
              match_file.csv
              enriched_file.pkl
              enriched_file.csv
"""
# Input files and output directory for this run.
sample_file = 'yelp_sample_AZ.pkl'
localdata_file = 'yelp_3000_AZ.csv'
result_dir = 'yelp_result'

# Describe the three data sets handed to smartCrawl.
sampledata = SampleData(sample_ratio=0.5, samplepath=sample_file, filetype='pkl', uniqueid="business_id", querylist=["name"])
localdata = LocalData(localpath=localdata_file, filetype='csv', uniqueid="business_id",
                      querylist=["name"],
                      matchlist=["name", "full_address"])
hiddendata = HiddenData(result_dir=result_dir, uniqueid="id",
                        matchlist=["name", "location.display_address.*"])

# Run the crawl with the short five-argument call form.
budget = 20
smartcrawl.smartCrawl(budget, yelp, sampledata, localdata, hiddendata)

# Alternative call with three extra arguments, kept disabled inside a string.
"""
pool_thre = 2
jaccard_thre = 0.85
threads = 4
smartcrawl.smartCrawl(budget, yelp, sampledata, localdata, hiddendata, pool_thre, jaccard_thre, threads)
"""
Beispiel #9
0
 def setUp(self):
     """Create the ``LocalData`` fixture backed by the Yelp AZ CSV example."""
     # The example directory sits two levels above this test module.
     base_dir = os.path.abspath(os.path.dirname(__file__) + os.path.sep + "../../")
     csv_path = base_dir + '/yelp_example/yelp_3000_AZ.csv'
     self.localdata = LocalData(
         csv_path, 'csv', "business_id",
         ["name"], ["name", "full_address"])
from deeperlib.api.yelp.searchapi import SearchApi
from deeperlib.core import utils
from deeperlib.data_processing.local_data import LocalData
from deeperlib.estimator import sampler

# ==== Sota-Sampler Yelp ====
# NOTE(review): the API credentials below are hard-coded in source; consider
# loading them from the environment or an untracked config file instead.
client_id = "kCe2YbZePXsPnC204ZrXoQ"
client_secret = "s9KnvEEQW7jaA2wlrBi4X2fnDQ0F7asdklXVvJUidWp8i50ov24E8EjkHX2AUhoL"
search_term = 'term'
parameters = {'limit': 50, 'location': 'AZ'}
yelp = SearchApi(
    client_id=client_id, client_secret=client_secret,
    top_k=300, delay=5, search_term=search_term, **parameters)

# Load the local pickle and derive the initial query pool from its query part.
local_file = 'yelp_3000.pkl'
localdata = LocalData(
    local_file, 'pkl', "row['business_id']",
    ["row['name']"], ["row['name']", "row['full_address']"])
localdata_ids, localdata_query, localdata_er = localdata.getlocalData()
initQueries = utils.queryGene(localdata_query, 2)

# Run the sampler, then close the API session.
sampler.sota_sampler(
    query_pool=initQueries, api=yelp,
    match_term=localdata.getQueryList(), top_k=300, adjustment=1)
yelp.getSession().close()
Beispiel #11
0
from deeperlib.api.yelp.searchapi import SearchApi
from deeperlib.core import utils
from deeperlib.data_processing.local_data import LocalData
from deeperlib.estimator import sampler

# ==== Sota-Sampler Yelp ====
# NOTE(review): the API credentials below are hard-coded in source; consider
# loading them from the environment or an untracked config file instead.
client_id = "kCe2YbZePXsPnC204ZrXoQ"
client_secret = "s9KnvEEQW7jaA2wlrBi4X2fnDQ0F7asdklXVvJUidWp8i50ov24E8EjkHX2AUhoL"
search_term = 'term'
parameters = {'limit': 50, 'location': 'AZ'}
yelp = SearchApi(
    client_id=client_id, client_secret=client_secret,
    top_k=300, delay=5, search_term=search_term, **parameters)

# Load the local pickle and derive the initial query pool from its query part.
local_file = 'yelp_3000.pkl'
localdata = LocalData(
    local_file, 'pkl', "business_id",
    ["name"], ["name", "full_address"])
localdata_ids, localdata_query, localdata_er = localdata.getlocalData()
initQueries = utils.queryGene(localdata_query, 2)

# Run the sampler, then close the API session.
sampler.sota_sampler(
    query_pool=initQueries, api=yelp,
    match_term=localdata.getQueryList(), top_k=300, adjustment=1)
yelp.getSession().close()
Beispiel #12
0
from deeperlib.api.dblp.publapi import PublApi
from deeperlib.core import utils
from deeperlib.data_processing.local_data import LocalData
from deeperlib.estimator import aggregation

# ==== Sota-Estimator Dblp ====
search_term = 'q'
parameters = {'h': 1000}
dblp = PublApi(delay=5, search_term=search_term, **parameters)

# NOTE(review): this path has no file extension although the file type passed
# below is 'pkl' -- confirm the file name is intended.
localdata_file = 'dblp_10000'
localdata = LocalData(
    localdata_file, 'pkl', "row['key']",
    ["row['title']"], ["row['title']"])
localdata_ids, localdata_query, localdata_er = localdata.getlocalData()
initQueries = utils.queryGene(localdata_query, 2)
aggregation.sota_estimator(
    query_pool=initQueries, api=dblp,
    match_term=["row['info']['title']"],
    uniqueid="row['info']['key']",
    query_num=1)

# ==== Stratified-Estimator Dblp ====
# NOTE(review): `dblp` is rebound to a fresh PublApi here and only this second
# instance has its session closed at the end -- confirm the first one does not
# need closing.
dblp = PublApi(delay=5, search_term=search_term, **parameters)
aggregation.stratified_estimator(
    query_pool=initQueries, api=dblp,
    match_term=["row['info']['title']"],
    candidate_rate=0.2, query_num=100)
dblp.getSession().close()
from deeperlib.api.dblp.publapi import PublApi
from deeperlib.core import utils
from deeperlib.data_processing.local_data import LocalData
from deeperlib.estimator import aggregation

# ==== Sota-Estimator Dblp ====
search_term = 'q'
parameters = {'h': 1000}
dblp = PublApi(top_k=1000, delay=5, search_term=search_term, **parameters)

# Load the CSV sample and derive the initial query pool from its query part.
localdata_file = 'dblp_sample.csv'
localdata = LocalData(
    localdata_file, 'csv', "key",
    ["title"], ["title"])
localdata_ids, localdata_query, localdata_er = localdata.getlocalData()
initQueries = utils.queryGene(localdata_query, 2)
aggregation.sota_estimator(
    query_pool=initQueries, api=dblp,
    match_term=["info.title"], uniqueid="info.key",
    query_num=1)

# ==== Stratified-Estimator Dblp ====
# Reuses the same API client and query pool as the estimator call above.
aggregation.stratified_estimator(
    query_pool=initQueries, api=dblp,
    match_term=["info.title"],
    candidate_rate=0.2, query_num=100)
dblp.getSession().close()
Beispiel #14
0
format     The result format of the search. Recognized values are "xml",     xml      ...?q=test&format=json
           "json", and "jsonp".
h          Maximum number of search results (hits) to return. For bandwidth  30       ...?q=test&h=100
           reasons, this number is capped at 1000.
f          The first hit in the numbered sequence of search results          0        ...?q=test&h=100&f=300
           (starting with 0) to return. In combination with the h parameter,
           this parameter can be used for pagination of search results.
c          Maximum number of completion terms (see below) to return. For     10       ...?q=test&c=0
           bandwidth reasons, this number is capped at 1000.
"""
search_term = 'q'
parameters = {'h': 1000}
dblp = PublApi(delay=5, search_term=search_term, **parameters)

# File layout used by this example. The literal is a raw string so that the
# backslashes in the listing are not parsed as (invalid) escape sequences.
r"""
\dblp_sample.pkl
 dblp_10000.pkl
 dblp_result\\result_file.pkl
              result_file.csv
              match_file.pkl
              match_file.csv
"""
# Input files and output directory for this run.
sample_file = 'dblp_sample.pkl'
localdata_file = 'dblp_10000.pkl'
result_dir = 'dblp_result'

# Describe the three data sets handed to smartCrawl.
sampledata = SampleData(samplepath=sample_file, filetype='pkl', uniqueid="row['key']", querylist=["row['title']"])
localdata = LocalData(localpath=localdata_file, filetype='pkl', uniqueid="row['key']", querylist=["row['title']"],
                      matchlist=["row['title']"])
hiddendata = HiddenData(result_dir=result_dir, uniqueid="row['info']['key']", matchlist=["row['info']['title']"])

# NOTE(review): top_k, count, pool_thre, jaccard_thre, threads and budget are
# not defined in this block; they are expected to be set earlier in the script.
smartcrawl.smartCrawl(top_k, count, pool_thre, jaccard_thre, threads, budget, dblp, sampledata, localdata, hiddendata)
Beispiel #15
0
 yelp_result\\result_file.pkl
              result_file.csv
              match_file.pkl
              match_file.csv
"""
# Input files and output directory for this run.
sample_file = 'yelp_sample.pkl'
localdata_file = 'yelp_3000.pkl'
result_dir = 'yelp_result'
sampledata = SampleData(
    samplepath=sample_file, filetype='pkl',
    uniqueid="row['id']", querylist=["row['name']"])
"""
localdata = LocalData(localpath=localdata_file, filetype='pkl', uniqueid="row['business_id']",
                      querylist=["row['name']"],
                      matchlist=["row['name']", "row['full_address']"])
hiddendata = HiddenData(result_dir=result_dir, uniqueid="row['id']",
                        matchlist=["row['name']", "' '.join(row['location']['display_address'])"])
"""
# Active configuration: the local data comes from a CSV file rather than the
# pickle referenced in the disabled variant kept in the string above.
localdata = LocalData(
    localpath='yelp_5882.csv', filetype='csv',
    uniqueid='ID', querylist=['NAME'], matchlist=['NAME', 'ADDRESS'])
hiddendata = HiddenData(result_dir=result_dir, uniqueid="row['id']",
                        matchlist=["row['name']",
                                   "' '.join(row['location']['display_address'])"])

# NOTE(review): top_k, count, pool_thre, jaccard_thre, threads, budget and yelp
# are not defined in this block; they are expected to be set earlier in the
# script.
smartcrawl.smartCrawl(
    top_k, count, pool_thre, jaccard_thre, threads, budget,
    yelp, sampledata, localdata, hiddendata)