Example #1
def test_sota_sampler(self):
    # Resolve the path to the yelp_3000.pkl fixture relative to this test file.
    local_file = os.path.abspath(
        os.path.dirname(__file__) + os.path.sep +
        "../../") + '/pkl_example/yelp_3000.pkl'
    localdata = LocalData(local_file, 'pkl', "business_id", ["name"],
                          ["name", "full_address"])
    localdata_ids, localdata_query, localdata_er = localdata.getlocalData()
    # Build the initial query pool and draw one sample with the SOTA sampler.
    initQueries = utils.queryGene(localdata_query, 2)
    sampler.sota_sampler(query_pool=initQueries, api=self.yelp,
                         match_term=localdata.getQueryList(), top_k=300,
                         adjustment=1, samplenum=1)
    self.yelp.getSession().close()
    assert True
Example #2
def setUp(self):
    search_term = 'q'
    parameters = {'h': 1000}
    # DBLP publication-search wrapper; 'h' caps the number of hits per call.
    self.dblp = PublApi(delay=5, search_term=search_term, **parameters)
    # Resolve the path to the dblp_10000.pkl fixture relative to this test file.
    localdata_file = os.path.abspath(
        os.path.dirname(__file__) + os.path.sep +
        "../../") + '/example/dblp_10000.pkl'
    localdata = LocalData(localdata_file, 'pkl', "row['key']",
                          ["row['title']"], ["row['title']"])
    localdata_ids, localdata_query, localdata_er = localdata.getlocalData()
    # Initial query pool shared by the tests in this case.
    self.initQueries = utils.queryGene(localdata_query, 2)
Example #3
def setUp(self):
    search_term = 'q'
    parameters = {'h': 1000}
    # Same DBLP wrapper as Example #2, additionally capped to the top 1000 results.
    self.dblp = PublApi(top_k=1000,
                        delay=5,
                        search_term=search_term,
                        **parameters)
    # Resolve the path to the dblp_sample.csv fixture relative to this test file.
    localdata_file = os.path.abspath(
        os.path.dirname(__file__) + os.path.sep +
        "../../") + '/csv_example/dblp_sample.csv'
    # CSV-backed local data uses plain column names instead of row[...] accessors.
    localdata = LocalData(localdata_file, 'csv', "key", ["title"],
                          ["title"])
    localdata_ids, localdata_query, localdata_er = localdata.getlocalData()
    self.initQueries = utils.queryGene(localdata_query, 2)
Example #4
from deeperlib.api.yelp.searchapi import SearchApi
from deeperlib.core import utils
from deeperlib.data_processing.local_data import LocalData
from deeperlib.estimator import sampler

# ==== Sota-Sampler Yelp ====
client_id = "kCe2YbZePXsPnC204ZrXoQ"
client_secret = "s9KnvEEQW7jaA2wlrBi4X2fnDQ0F7asdklXVvJUidWp8i50ov24E8EjkHX2AUhoL"
search_term = 'term'
parameters = {'limit': 50, 'location': 'AZ'}
yelp = SearchApi(client_id=client_id,
                 client_secret=client_secret,
                 top_k=300,
                 delay=5,
                 search_term=search_term,
                 **parameters)
local_file = 'yelp_3000.pkl'
localdata = LocalData(local_file, 'pkl', "row['business_id']", ["row['name']"],
                      ["row['name']", "row['full_address']"])
localdata_ids, localdata_query, localdata_er = localdata.getlocalData()
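# Generate the initial query pool, then run the SOTA sampler against the Yelp API.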
initQueries = utils.queryGene(localdata_query, 2)
sampler.sota_sampler(query_pool=initQueries,
                     api=yelp,
                     match_term=localdata.getQueryList(),
                     top_k=300,
                     adjustment=1)
yelp.getSession().close()
Example #5
import copy
import sys
import timeit

from deeperlib.core import utils

perr = sys.stderr  # not defined in the original snippet; stderr is a sensible default


def SmartCrawl(budget, api, sampledata, localdata, hiddendata,
               pool_thre=2, jaccard_thre=0.75, threads=4):
    time_s = timeit.default_timer()
    sample = sampledata.getSample()
    D1_ids, D1_query, D1_er = localdata.getlocalData()

    top_k = api.getTopk()
    sample_rate = sampledata.getRatio() / 100.0
    Dratio = 1.0 * len(D1_ids) * sample_rate / len(sample)

    time_e = timeit.default_timer()
    print(time_e - time_s, 'data loaded.', file=perr)

    time_s = timeit.default_timer()
    initQueries = utils.queryGene(D1_query, pool_thre)
    time_e = timeit.default_timer()
    print(time_e - time_s, 'query pool finished.', file=perr)

    ##### inverted index #####
    time_s = timeit.default_timer()
    D1index = utils.invertedIndex(initQueries, D1_query)
    initQueries, D1index = utils.add_naiveIndex(initQueries, D1_query, D1index)
    sampleindex = utils.invertedIndex(initQueries, sample)
    time_e = timeit.default_timer()
    print(time_e - time_s, 'index building finished.', file=perr)
    ##### forward index #####
    time_s = timeit.default_timer()
    findex = utils.forwardIndex(D1index)
    time_e = timeit.default_timer()
    print(time_e - time_s, 'forward index built.', file=perr)

    ##### biased initialization #####
    D1_ids_deeper = copy.deepcopy(D1_ids)
    # Priority queue of candidate queries, scored by estimated benefit.
    query_pool = utils.initScore_biased(sampleindex, top_k, sample_rate,
                                        Dratio, initQueries)
    flagNum = len(query_pool) - budget
    curcov = set()
    curmat = []
    updateList = utils.updateList(D1index)
    queryList = []

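    # Lazy priority maintenance: a popped query whose score is stale
    # (updateList counter != 0) is re-scored and pushed back instead of
    # being issued, so priorities track records that are already covered.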
    while len(query_pool) > flagNum and len(query_pool) != 0 and len(curcov) < len(D1_ids):
        queries = []
        while len(queries) < threads:
            if len(query_pool) > flagNum and len(query_pool) > 0:
                top = query_pool.popitem()
                if updateList[top[0]] != 0:
                    if len(sampleindex[top[0]]) <= top_k * sample_rate:
                        if len(sampleindex[top[0]]) == 0 and len(D1index[top[0]]) > (top_k * Dratio):
                            new_priority = top[1] - updateList[top[0]] * top_k * Dratio / len(D1index[top[0]])
                        else:
                            new_priority = top[1] - updateList[top[0]]
                    else:
                        new_priority = top[1] - updateList[top[0]] * top_k * sample_rate / len(sampleindex[top[0]])
                    query_pool.additem(top[0], new_priority)
                    updateList[top[0]] = 0
                    continue
                else:
                    queries.append(list(top[0]))
            else:
                break

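        # Issue the selected batch against the hidden database, entity-resolve
        # the returned records, and match them to still-uncovered local records.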
        queryList.extend(queries)
        cur_raw_result = api.callMulAPI(queries)
        cur_er_result = hiddendata.proResult(cur_raw_result)
        matched_ids, matched_pair = utils.results_simjoin(cur_er_result, D1_er, jaccard_thre)
        removed_ids = D1_ids_deeper.intersection(matched_ids)
        for d in removed_ids:
            for q in findex[d]:
                updateList[q] += 1

        D1_ids_deeper.difference_update(matched_ids)
        curcov = curcov.union(matched_ids)
        curmat.extend(matched_pair)
        print('smartcrawl, coverage ratio:',
              100.0 * len(curcov) / len(D1_ids), '%,',
              len(cur_raw_result), 'results returned,',
              len(matched_ids), 'local records covered at this iteration,',
              len(hiddendata.getMergeResult()), 'different results returned,',
              len(curcov), 'local records covered in total.',
              file=perr)
    api.getSession().close()
    # Persist the issued queries and the matched pairs on the hidden-data object.
    hiddendata.setQueryList(queryList)
    hiddendata.setMatchPair(curmat)
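
For context, a minimal driver for SmartCrawl might look like the sketch below. The SearchApi and LocalData setup is copied from Example #4; the SampleData and HiddenData import paths and constructor arguments are assumptions inferred from the methods SmartCrawl actually calls (getSample()/getRatio() on the sample object, proResult()/getMergeResult()/setQueryList()/setMatchPair() on the hidden-data object), not confirmed deeperlib API.

from deeperlib.api.yelp.searchapi import SearchApi
from deeperlib.data_processing.local_data import LocalData
# ASSUMED import paths, modeled on local_data above -- verify against deeperlib.
from deeperlib.data_processing.sample_data import SampleData
from deeperlib.data_processing.hidden_data import HiddenData

api = SearchApi(client_id='<client_id>', client_secret='<client_secret>',
                top_k=300, delay=5, search_term='term',
                limit=50, location='AZ')
localdata = LocalData('yelp_3000.pkl', 'pkl', "row['business_id']",
                      ["row['name']"], ["row['name']", "row['full_address']"])
# ASSUMED constructors: SmartCrawl only needs getSample()/getRatio() from
# sampledata and proResult()/getMergeResult()/setQueryList()/setMatchPair()
# from hiddendata.
sampledata = SampleData('yelp_sample.pkl', 10)  # hypothetical: sample file, ratio in %
hiddendata = HiddenData('yelp_results')         # hypothetical: result sink

SmartCrawl(budget=100, api=api, sampledata=sampledata, localdata=localdata,
           hiddendata=hiddendata, pool_thre=2, jaccard_thre=0.75, threads=4)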