class AggregationTestCase(unittest.TestCase):
    """Smoke tests for the aggregation estimators against the dblp Publ API.

    Each test passes when the estimator runs end to end without raising;
    the trailing ``assert True`` only marks that point.
    """

    def setUp(self):
        # dblp Publ API: query parameter is 'q', 'h' caps hits per request.
        self.dblp = PublApi(top_k=1000, delay=5, search_term='q', **{'h': 1000})
        sample_path = os.path.abspath(
            os.path.dirname(__file__) + os.path.sep + "../../") + '/csv_example/dblp_sample.csv'
        sample = LocalData(sample_path, 'csv', "key", ["title"], ["title"])
        _ids, sample_queries, _er = sample.getlocalData()
        # Candidate pool of 2-term queries derived from the sampled titles.
        self.initQueries = utils.queryGene(sample_queries, 2)

    def test_stra_stratified_estimator(self):
        """Stratified estimator completes without raising."""
        aggregation.stratified_estimator(
            query_pool=self.initQueries,
            api=self.dblp,
            match_term=["info.title"],
            candidate_rate=0.2,
            query_num=100)
        assert True

    def test_sota_estimator(self):
        """State-of-the-art estimator completes; HTTP session is released."""
        aggregation.sota_estimator(
            query_pool=self.initQueries,
            api=self.dblp,
            match_term=["info.title"],
            uniqueid="info.key",
            query_num=1)
        self.dblp.getSession().close()
        assert True
def setUp(self):
    """Build a dblp Publ API handle and 2-term candidate queries from a local pickle."""
    # 'q' is the dblp query parameter; 'h' caps hits per request.
    self.dblp = PublApi(delay=5, search_term='q', **{'h': 1000})
    pkl_path = os.path.abspath(
        os.path.dirname(__file__) + os.path.sep + "../../") + '/example/dblp_10000.pkl'
    local = LocalData(pkl_path, 'pkl', "row['key']", ["row['title']"], ["row['title']"])
    _ids, local_queries, _er = local.getlocalData()
    # Candidate pool of 2-term queries derived from the local titles.
    self.initQueries = utils.queryGene(local_queries, 2)
def setUp(self):
    """Build a dblp Publ API handle and 2-term candidate queries from a local CSV sample."""
    # 'q' is the dblp query parameter; 'h' caps hits per request.
    self.dblp = PublApi(top_k=1000, delay=5, search_term='q', **{'h': 1000})
    csv_path = os.path.abspath(
        os.path.dirname(__file__) + os.path.sep + "../../") + '/csv_example/dblp_sample.csv'
    local = LocalData(csv_path, 'csv', "key", ["title"], ["title"])
    _ids, local_queries, _er = local.getlocalData()
    # Candidate pool of 2-term queries derived from the sampled titles.
    self.initQueries = utils.queryGene(local_queries, 2)
class DblpPublapiTestCase(unittest.TestCase):
    """Live tests for PublApi against the dblp search service.

    Hit-count thresholds (>= 900, >= 1200) are loose lower bounds so the
    tests tolerate result-set growth/shrinkage on the live index.
    """

    def setUp(self):
        # dblp Publ API: query parameter 'q', up to 1000 hits per request.
        self.dblp = PublApi(top_k=1000, delay=5, search_term='q', **{'h': 1000})

    def tearDown(self):
        # Drop the handle so each test starts from a fresh client.
        self.dblp = None

    def test_callApi(self):
        """Single call: query terms are '+'-joined into the search parameter."""
        terms = ['set', 'cover']
        request_params = self.dblp.getKwargs()
        request_params[self.dblp.getSearchTerm()] = '+'.join(terms)
        hits = self.dblp.callAPI(params=request_params)
        assert len(hits) >= 900

    def test_callMulApi(self):
        """Multi-query call: results of both queries are accumulated."""
        term_lists = [['set', 'cover'], ['approximate', 'query']]
        hits = self.dblp.callMulAPI(term_lists)
        self.dblp.getSession().close()
        assert len(hits) >= 1200
Parameter Description Default Example q The query string to search for, as described on a separate page. ...?q=test+search format The result format of the search. Recognized values are "xml", xml ...?q=test&format=json "json", and "jsonp". h Maximum number of search results (hits) to return. For bandwidth 30 ...?q=test&h=100 reasons, this number is capped at 1000. f The first hit in the numbered sequence of search results (starting with 0) to return. In combination with the h parameter, 0 ...?q=test&h=100&f=300 this parameter can be used for pagination of search results. c Maximum number of completion terms (see below) to return. For 10 ...?q=test&c=0 bandwidth reasons, this number is capped at 1000. """ search_term = 'q' parameters = {'h': 1000} dblp = PublApi(top_k=1000, delay=5, search_term=search_term, **parameters) """ \dblp_sample.csv dblp_3881.csv dblp_result\\result_file.pkl result_file.csv match_file.pkl match_file.csv enriched_file.pkl enriched_file.csv """ sample_file = 'dblp_sample.csv' localdata_file = 'dblp_3881.csv' result_dir = 'dblp_result' sampledata = SampleData(sample_ratio=0.5, samplepath=sample_file,
from deeperlib.api.dblp.publapi import PublApi
from deeperlib.core import utils
from deeperlib.data_processing.local_data import LocalData
from deeperlib.estimator import aggregation

# ==== Sota-Estimator Dblp ====
# dblp Publ API: 'q' is the query parameter; 'h' caps hits per request at 1000.
search_term = 'q'
parameters = {'h': 1000}
dblp = PublApi(delay=5, search_term=search_term, **parameters)

# Local sample of dblp records stored as a pickle.
# FIX: the path previously lacked the '.pkl' extension even though the file is
# loaded with filetype='pkl'; sibling scripts consistently use 'dblp_10000.pkl'.
localdata_file = 'dblp_10000.pkl'
localdata = LocalData(localdata_file, 'pkl', "row['key']", ["row['title']"], ["row['title']"])
localdata_ids, localdata_query, localdata_er = localdata.getlocalData()

# Candidate pool of 2-term queries derived from the local records.
initQueries = utils.queryGene(localdata_query, 2)

aggregation.sota_estimator(query_pool=initQueries, api=dblp,
                           match_term=["row['info']['title']"],
                           uniqueid="row['info']['key']", query_num=1)

# ==== Stratified-Estimator Dblp ====
# Fresh API handle for the second estimator run.
dblp = PublApi(delay=5, search_term=search_term, **parameters)
aggregation.stratified_estimator(query_pool=initQueries, api=dblp,
                                 match_term=["row['info']['title']"],
                                 candidate_rate=0.2, query_num=100)

# Release the underlying HTTP session.
dblp.getSession().close()
def Deeper_WEB(budget, api_msg, original_csv, local_match, hidden_match):
    """Run a SmartCrawl and a NaiveCrawl against one of four hidden-web APIs
    and return their results for comparison.

    Parameters:
        budget       -- query budget forwarded to SmartCrawl / NaiveCrawl.
        api_msg      -- list; api_msg[0] selects the backend by substring
                        ('dblp Publ API' / 'aminer Publ API' / 'yelp Search API'
                        / 'google Place API'); api_msg[1] is used as a location
                        string for the yelp and google branches.
        original_csv -- local table as a list of rows (row 0 is the header);
                        mutated in place (HTML stripped, 'AutoID' column
                        appended when no hidden-id column is mapped).
        local_match  -- local column names aligned index-by-index with
                        hidden_match.
        hidden_match -- hidden-API field names selected by the caller.

    Returns:
        dict with keys 'smart_queries', 'record', 'local_header',
        (optionally 'hidden_header'), 'naive' and 'naive_queries'.
    """
    # Strip injected <span> markup from local cells, remembering which rows
    # were touched; their ids are later handed to NaiveCrawl as "typo" rows.
    typo_ids = []
    parser = Deeper_HTMLParser()
    for i in range(1, len(original_csv)):
        for j in range(1, len(original_csv[i])):
            if '</span>' in original_csv[i][j]:
                parser.feed(original_csv[i][j])
                original_csv[i][j] = parser.get_text()
                typo_ids.append(original_csv[i][0])
                # NOTE(review): breaks after the FIRST marked cell of a row —
                # later cells of the same row keep their markup; confirm intended.
                break
    # ---- Select API, sample data, hidden-data schema and local-data mapping
    # ---- for the chosen backend.
    if 'dblp Publ API' in api_msg[0]:
        search_term = 'q'
        parameters = {'h': 1000}  # dblp caps hits per request at 1000
        api = PublApi(top_k=1000, delay=5, search_term=search_term, **parameters)
        sample_file = settings.BASE_DIR + '/netdisk/dblp_sample.csv'
        sampledata = SampleData(sample_ratio=0.5,
                                samplepath=sample_file,
                                filetype='csv',
                                uniqueid="key",
                                querylist=["title"])
        hiddendata = HiddenData(uniqueid="info.key", matchlist=["info.title"])
        if "info.key" in hidden_match:
            # A local column is mapped onto the hidden unique id.
            localdata = LocalData(
                uniqueid=local_match[hidden_match.index("info.key")],
                querylist=[local_match[hidden_match.index("info.title")]],
                matchlist=[local_match[hidden_match.index("info.title")]],
                data_raw=original_csv)
        else:
            # No mapped id column: synthesize a sequential 'AutoID' column.
            uniqueID = 'AutoID'
            original_csv[0].append(uniqueID)
            for i in range(1, len(original_csv)):
                original_csv[i].append(i)
            localdata = LocalData(
                uniqueid=uniqueID,
                querylist=[local_match[hidden_match.index("info.title")]],
                matchlist=[local_match[hidden_match.index("info.title")]],
                data_raw=original_csv)
    elif 'aminer Publ API' in api_msg[0]:
        search_term = 'term'
        parameters = {'size': 100, 'sort': 'relevance'}
        api = AdvancedPublApi(top_k=300, delay=5, search_term=search_term, **parameters)
        sample_file = settings.BASE_DIR + '/netdisk/dblp_sample.csv'
        sampledata = SampleData(sample_ratio=0.5,
                                samplepath=sample_file,
                                filetype='csv',
                                uniqueid="key",
                                querylist=["title"])
        hiddendata = HiddenData(uniqueid="id", matchlist=["title"])
        if "id" in hidden_match:
            localdata = LocalData(
                uniqueid=local_match[hidden_match.index("id")],
                querylist=[local_match[hidden_match.index("title")]],
                matchlist=[local_match[hidden_match.index("title")]],
                data_raw=original_csv)
        else:
            uniqueID = 'AutoID'
            original_csv[0].append(uniqueID)
            for i in range(1, len(original_csv)):
                original_csv[i].append(i)
            localdata = LocalData(
                uniqueid=uniqueID,
                querylist=[local_match[hidden_match.index("title")]],
                matchlist=[local_match[hidden_match.index("title")]],
                data_raw=original_csv)
    elif 'yelp Search API' in api_msg[0]:
        # NOTE(review): hard-coded credentials checked into source — move to
        # configuration / secret storage.
        client_id = "QhqrWe9agsd0Ad6Gs0qgMQ"
        client_secret = "6WQWRMV8edOhaThyWgm96wAJkIzJ1pHOhm5N0AD20edrnzv0lwi3wfgZAFp0IqQ6WIc-pZki83kjpViwptlcsiV0-Ij3HI6AJxhOTE4jsjNOoZOHZI3823twg8yZWXYx"
        search_term = 'term'
        parameters = {'limit': 50, 'location': api_msg[1]}
        api = SearchApi(client_id=client_id, client_secret=client_secret,
                        top_k=300, delay=5, search_term=search_term, **parameters)
        sample_file = "%s/netdisk/yelp_sample_%s.pkl" % (settings.BASE_DIR, api_msg[1])
        sampledata = SampleData(sample_ratio=0.5,
                                samplepath=sample_file,
                                filetype='pkl',
                                uniqueid="business_id",
                                querylist=["name"])
        hiddendata = HiddenData(
            uniqueid="id", matchlist=["name", "location.display_address.*"])
        if "id" in hidden_match:
            localdata = LocalData(
                uniqueid=local_match[hidden_match.index("id")],
                querylist=[local_match[hidden_match.index("name")]],
                matchlist=[
                    local_match[hidden_match.index("name")],
                    local_match[
                        hidden_match.index("location.display_address.*")]
                ],
                data_raw=original_csv)
        else:
            uniqueID = 'AutoID'
            original_csv[0].append(uniqueID)
            for i in range(1, len(original_csv)):
                original_csv[i].append(i)
            localdata = LocalData(
                uniqueid=uniqueID,
                querylist=[local_match[hidden_match.index("name")]],
                matchlist=[
                    local_match[hidden_match.index("name")],
                    local_match[
                        hidden_match.index("location.display_address.*")]
                ],
                data_raw=original_csv)
    elif 'google Place API' in api_msg[0]:
        search_term = 'query'
        # NOTE(review): hard-coded API key checked into source — move to config.
        parameters = {'key': 'AIzaSyDhBJSPqHfcEkPGQGbH7l3eWyF_PhF10iw'}
        api = TextSearchApi(location='in+' + api_msg[1], top_k=60, delay=5,
                            search_term=search_term, **parameters)
        sample_file = "%s/netdisk/yelp_sample_%s.pkl" % (settings.BASE_DIR, api_msg[1])
        sampledata = SampleData(sample_ratio=0.5,
                                samplepath=sample_file,
                                filetype='pkl',
                                uniqueid="business_id",
                                querylist=["name"])
        hiddendata = HiddenData(uniqueid="place_id",
                                matchlist=["name", "formatted_address"])
        if "place_id" in hidden_match:
            localdata = LocalData(
                uniqueid=local_match[hidden_match.index("place_id")],
                querylist=[local_match[hidden_match.index("name")]],
                matchlist=[
                    local_match[hidden_match.index("name")],
                    local_match[hidden_match.index("formatted_address")]
                ],
                data_raw=original_csv)
        else:
            uniqueID = 'AutoID'
            original_csv[0].append(uniqueID)
            for i in range(1, len(original_csv)):
                original_csv[i].append(i)
            localdata = LocalData(
                uniqueid=uniqueID,
                querylist=[local_match[hidden_match.index("name")]],
                matchlist=[
                    local_match[hidden_match.index("name")],
                    local_match[hidden_match.index("formatted_address")]
                ],
                data_raw=original_csv)
    # Run the smart crawl, then assemble local/hidden record pairs.
    SmartCrawl(budget, api, sampledata, localdata, hiddendata)
    localdata_csv = localdata.getRawData()
    crawldata_csv = Json2csv(hiddendata.getMergeResult()).getCsvdata()
    result = {}
    result['smart_queries'] = hiddendata.getQueryList()
    result['record'] = []
    if 'header' in crawldata_csv:
        result['local_header'] = localdata_csv['header']
        result['hidden_header'] = crawldata_csv['header']
        # Sort match pairs numerically by local id when ids are integers,
        # falling back to plain (string) ordering otherwise.
        try:
            matchpair = sorted(hiddendata.getMatchPair().items(),
                               key=lambda item: int(item[0]),
                               reverse=False)
        except (ValueError):
            matchpair = sorted(hiddendata.getMatchPair().items(),
                               key=lambda item: item[0],
                               reverse=False)
        for m in matchpair:
            # Each record: the local row followed by every matched hidden row.
            temp_record = []
            local_id = m[0]
            temp_record.append(localdata_csv[local_id])
            for hidden_id in m[1]:
                temp_record.append(crawldata_csv[hidden_id])
            result['record'].append(temp_record)
    else:
        # Crawl returned nothing tabular; expose only the local header.
        result['local_header'] = localdata_csv['header']
    # ---- NaiveCrawl baseline: rebuild the API with minimal page size so each
    # ---- query costs a single small request.
    # NOTE(review): indentation reconstructed from a whitespace-mangled source;
    # confirm this baseline block runs regardless of the branch above.
    if 'dblp Publ API' in api_msg[0]:
        search_term = 'q'
        parameters = {'h': 1}
        api = PublApi(top_k=1, delay=5, search_term=search_term, **parameters)
        hiddendata = HiddenData(uniqueid="info.key", matchlist=["info.title"])
    elif 'aminer Publ API' in api_msg[0]:
        search_term = 'term'
        parameters = {'size': 1, 'sort': 'relevance'}
        api = AdvancedPublApi(top_k=1, delay=5, search_term=search_term, **parameters)
        hiddendata = HiddenData(uniqueid="id", matchlist=["title"])
    elif 'yelp Search API' in api_msg[0]:
        client_id = "QhqrWe9agsd0Ad6Gs0qgMQ"
        client_secret = "6WQWRMV8edOhaThyWgm96wAJkIzJ1pHOhm5N0AD20edrnzv0lwi3wfgZAFp0IqQ6WIc-pZki83kjpViwptlcsiV0-Ij3HI6AJxhOTE4jsjNOoZOHZI3823twg8yZWXYx"
        search_term = 'term'
        parameters = {'limit': 1, 'location': 'AZ'}
        api = SearchApi(client_id=client_id, client_secret=client_secret,
                        top_k=1, delay=5, search_term=search_term, **parameters)
        hiddendata = HiddenData(
            uniqueid="id", matchlist=["name", "location.display_address.*"])
    elif 'google Place API' in api_msg[0]:
        search_term = 'query'
        parameters = {'key': 'AIzaSyDhBJSPqHfcEkPGQGbH7l3eWyF_PhF10iw'}
        api = TextSearchApi(location='in+' + api_msg[1], top_k=60, delay=5,
                            search_term=search_term, **parameters)
        hiddendata = HiddenData(uniqueid="place_id",
                                matchlist=["name", "formatted_address"])
    result['naive'] = NaiveCrawl(budget, api, localdata, hiddendata, typo_ids)
    result['naive_queries'] = hiddendata.getQueryList()
    return result
def setUp(self):
    """Create a dblp Publ API handle ('q' query parameter, 1000 hits per request)."""
    self.dblp = PublApi(top_k=1000, delay=5, search_term='q', **{'h': 1000})
Parameter Description Default Example q The query string to search for, as described on a separate page. ...?q=test+search format The result format of the search. Recognized values are "xml", xml ...?q=test&format=json "json", and "jsonp". h Maximum number of search results (hits) to return. For bandwidth 30 ...?q=test&h=100 reasons, this number is capped at 1000. f The first hit in the numbered sequence of search results (starting with 0) to return. In combination with the h parameter, 0 ...?q=test&h=100&f=300 this parameter can be used for pagination of search results. c Maximum number of completion terms (see below) to return. For 10 ...?q=test&c=0 bandwidth reasons, this number is capped at 1000. """ search_term = 'q' parameters = {'h': 1000} dblp = PublApi(delay=5, search_term=search_term, **parameters) """ \dblp_sample.pkl dblp_10000.pkl dblp_result\\result_file.pkl result_file.csv match_file.pkl match_file.csv """ sample_file = 'dblp_sample.pkl' localdata_file = 'dblp_10000.pkl' result_dir = 'dblp_result' sampledata = SampleData(samplepath=sample_file, filetype='pkl', uniqueid="row['key']", querylist=["row['title']"]) localdata = LocalData(localpath=localdata_file, filetype='pkl', uniqueid="row['key']", querylist=["row['title']"], matchlist=["row['title']"])