class SamplerTestCase(unittest.TestCase):
    """Integration test that runs ``sampler.sota_sampler`` against the Yelp API.

    The query pool is generated from the pickled local data set
    ``example/yelp_3000.pkl`` located two directories above this file.
    """

    def setUp(self):
        """Create the Yelp ``SearchApi`` client used by the test."""
        # NOTE(security): credentials should not live in source control.
        # They can be supplied through the environment; the literals below
        # are kept only as a backward-compatible fallback and should be
        # rotated and removed.
        client_id = os.environ.get("YELP_CLIENT_ID", "kCe2YbZePXsPnC204ZrXoQ")
        client_secret = os.environ.get(
            "YELP_CLIENT_SECRET",
            "s9KnvEEQW7jaA2wlrBi4X2fnDQ0F7asdklXVvJUidWp8i50ov24E8EjkHX2AUhoL")
        search_term = 'term'
        parameters = {'limit': 50, 'location': 'AZ'}
        self.yelp = SearchApi(client_id=client_id,
                              client_secret=client_secret,
                              top_k=300,
                              delay=5,
                              search_term=search_term,
                              **parameters)

    def test_sota_sampler(self):
        """Run one sampling round; the test passes if no exception is raised."""
        try:
            # os.path.join keeps the path relative to this file even when
            # dirname(__file__) is empty; plain concatenation would turn it
            # into "/../../", i.e. the filesystem root.
            project_root = os.path.abspath(
                os.path.join(os.path.dirname(__file__), os.pardir, os.pardir))
            local_file = os.path.join(project_root, 'example', 'yelp_3000.pkl')
            localdata = LocalData(local_file, 'pkl', "row['business_id']",
                                  ["row['name']"],
                                  ["row['name']", "row['full_address']"])
            _ids, localdata_query, _er = localdata.getlocalData()
            initQueries = utils.queryGene(localdata_query, 2)
            sampler.sota_sampler(query_pool=initQueries,
                                 api=self.yelp,
                                 match_term=localdata.getQueryList(),
                                 top_k=300,
                                 adjustment=1,
                                 samplenum=1)
        finally:
            # Release the session even when loading or sampling fails.
            self.yelp.getSession().close()
class SamplerTestCase(unittest.TestCase):
    """Integration test that runs ``sampler.sota_sampler`` against the Yelp API.

    The query pool is generated from the CSV local data set
    ``yelp_example/yelp_3000_AZ.csv`` located two directories above this file.
    """

    def setUp(self):
        """Create the Yelp ``SearchApi`` client used by the test."""
        # NOTE(security): credentials should not live in source control.
        # They can be supplied through the environment; the literals below
        # are kept only as a backward-compatible fallback and should be
        # rotated and removed.
        client_id = os.environ.get("YELP_CLIENT_ID", "QhqrWe9agsd0Ad6Gs0qgMQ")
        client_secret = os.environ.get(
            "YELP_CLIENT_SECRET",
            "6WQWRMV8edOhaThyWgm96wAJkIzJ1pHOhm5N0AD20edrnzv0lwi3wfgZAFp0IqQ6WIc-pZki83kjpViwptlcsiV0-Ij3HI6AJxhOTE4jsjNOoZOHZI3823twg8yZWXYx")
        search_term = 'term'
        parameters = {'limit': 50, 'location': 'AZ'}
        self.yelp = SearchApi(client_id=client_id,
                              client_secret=client_secret,
                              top_k=300,
                              delay=5,
                              search_term=search_term,
                              **parameters)

    def test_sota_sampler(self):
        """Run one sampling round; the test passes if no exception is raised."""
        try:
            # os.path.join keeps the path relative to this file even when
            # dirname(__file__) is empty; plain concatenation would turn it
            # into "/../../", i.e. the filesystem root.
            project_root = os.path.abspath(
                os.path.join(os.path.dirname(__file__), os.pardir, os.pardir))
            local_file = os.path.join(project_root, 'yelp_example',
                                      'yelp_3000_AZ.csv')
            localdata = LocalData(local_file, 'csv', "business_id",
                                  ["name"], ["name", "full_address"])
            _ids, localdata_query, _er = localdata.getlocalData()
            initQueries = utils.queryGene(localdata_query, 2)
            sampler.sota_sampler(query_pool=initQueries,
                                 api=self.yelp,
                                 match_term=localdata.getQueryList(),
                                 top_k=300,
                                 adjustment=1,
                                 samplenum=1)
        finally:
            # Release the session even when loading or sampling fails.
            self.yelp.getSession().close()
def setUp(self):
    """Create the Yelp ``SearchApi`` client (``top_k=300``) for each test."""
    import os

    # NOTE(security): credentials should not live in source control.
    # They can be supplied through the environment; the literals below are
    # kept only as a backward-compatible fallback and should be rotated
    # and removed.
    client_id = os.environ.get("YELP_CLIENT_ID", "QhqrWe9agsd0Ad6Gs0qgMQ")
    client_secret = os.environ.get(
        "YELP_CLIENT_SECRET",
        "6WQWRMV8edOhaThyWgm96wAJkIzJ1pHOhm5N0AD20edrnzv0lwi3wfgZAFp0IqQ6WIc-pZki83kjpViwptlcsiV0-Ij3HI6AJxhOTE4jsjNOoZOHZI3823twg8yZWXYx")
    search_term = 'term'
    parameters = {'limit': 50, 'location': 'AZ'}
    self.yelp = SearchApi(client_id=client_id,
                          client_secret=client_secret,
                          top_k=300,
                          delay=5,
                          search_term=search_term,
                          **parameters)
def setUp(self):
    """Create the Yelp ``SearchApi`` client (``top_k=1000``) for each test."""
    import os

    # NOTE(security): credentials should not live in source control.
    # They can be supplied through the environment; the literals below are
    # kept only as a backward-compatible fallback and should be rotated
    # and removed.
    client_id = os.environ.get("YELP_CLIENT_ID", "kCe2YbZePXsPnC204ZrXoQ")
    client_secret = os.environ.get(
        "YELP_CLIENT_SECRET",
        "s9KnvEEQW7jaA2wlrBi4X2fnDQ0F7asdklXVvJUidWp8i50ov24E8EjkHX2AUhoL")
    search_term = 'term'
    parameters = {'limit': 50, 'location': 'AZ'}
    self.yelp = SearchApi(client_id=client_id,
                          client_secret=client_secret,
                          top_k=1000,
                          delay=5,
                          search_term=search_term,
                          **parameters)
categories string Optional. Categories to filter the search results with. See the list of supported categories. The category filter can be a list of comma delimited categories. For example, "bars,french" will filter by Bars and French. The category identifier should be used (for example "discgolf", not "Disc Golf"). limit int Optional. Number of business results to return. By default, it will return 20. Maximum is 50. offset int Optional. Offset the list of returned business results by this amount. sort_by string Optional. Sort the results by one of the these modes: best_match, rating, review_count or distance. By default it's best_match. The rating sort is not strictly sorted by the rating value, but by an adjusted rating value that takes into account the number of ratings, similar to a bayesian average. This is so a business with 1 rating of 5 stars doesn't immediately jump to the top. """ client_id = client_secret = search_term = 'term' parameters = {'limit': 50, 'location': 'AZ'} yelp = SearchApi(client_id=client_id, client_secret=client_secret, top_k=300, delay=5, search_term=search_term, **parameters) """ \yelp_sample_AZ.pkl yelp_3000_AZ.csv yelp_result\\result_file.pkl result_file.csv match_file.pkl match_file.csv enriched_file.pkl enriched_file.csv """ sample_file = 'yelp_sample_AZ.pkl' localdata_file = 'yelp_3000_AZ.csv' result_dir = 'yelp_result' sampledata = SampleData(sample_ratio=0.5, samplepath=sample_file, filetype='pkl', uniqueid="business_id", querylist=["name"])
import os

from deeperlib.api.yelp.searchapi import SearchApi
from deeperlib.core import utils
from deeperlib.data_processing.local_data import LocalData
from deeperlib.estimator import sampler

# ==== Sota-Sampler Yelp ====
# Example script: builds a query pool from the local data set
# 'yelp_3000.pkl' and runs sampler.sota_sampler against the Yelp search API.

# NOTE(security): credentials should not live in source control. They can be
# supplied through the environment; the literals below are kept only as a
# backward-compatible fallback and should be rotated and removed.
client_id = os.environ.get("YELP_CLIENT_ID", "kCe2YbZePXsPnC204ZrXoQ")
client_secret = os.environ.get(
    "YELP_CLIENT_SECRET",
    "s9KnvEEQW7jaA2wlrBi4X2fnDQ0F7asdklXVvJUidWp8i50ov24E8EjkHX2AUhoL")
search_term = 'term'
parameters = {'limit': 50, 'location': 'AZ'}
yelp = SearchApi(client_id=client_id,
                 client_secret=client_secret,
                 top_k=300,
                 delay=5,
                 search_term=search_term,
                 **parameters)

try:
    local_file = 'yelp_3000.pkl'
    localdata = LocalData(local_file, 'pkl', "row['business_id']",
                          ["row['name']"],
                          ["row['name']", "row['full_address']"])
    localdata_ids, localdata_query, localdata_er = localdata.getlocalData()

    initQueries = utils.queryGene(localdata_query, 2)
    sampler.sota_sampler(query_pool=initQueries,
                         api=yelp,
                         match_term=localdata.getQueryList(),
                         top_k=300,
                         adjustment=1)
finally:
    # Release the session even when loading or sampling fails.
    yelp.getSession().close()
class YelpSearchapiTestCase(unittest.TestCase):
    """Integration tests for the Yelp ``SearchApi`` wrapper (``top_k=300``)."""

    def setUp(self):
        """Create a fresh ``SearchApi`` client for each test."""
        import os

        # NOTE(security): credentials should not live in source control.
        # They can be supplied through the environment; the literals below
        # are kept only as a backward-compatible fallback and should be
        # rotated and removed.
        client_id = os.environ.get("YELP_CLIENT_ID", "QhqrWe9agsd0Ad6Gs0qgMQ")
        client_secret = os.environ.get(
            "YELP_CLIENT_SECRET",
            "6WQWRMV8edOhaThyWgm96wAJkIzJ1pHOhm5N0AD20edrnzv0lwi3wfgZAFp0IqQ6WIc-pZki83kjpViwptlcsiV0-Ij3HI6AJxhOTE4jsjNOoZOHZI3823twg8yZWXYx")
        search_term = 'term'
        parameters = {'limit': 50, 'location': 'AZ'}
        self.yelp = SearchApi(client_id=client_id,
                              client_secret=client_secret,
                              top_k=300,
                              delay=5,
                              search_term=search_term,
                              **parameters)

    def tearDown(self):
        """Close the client's session after every test, pass or fail."""
        # Previously only test_callMulApi_sort closed the session, and only
        # when its API call succeeded.
        try:
            self.yelp.getSession().close()
        finally:
            self.yelp = None

    def test_callApi(self):
        """A single call with a joined two-word query returns >= 20 results."""
        query = ['tai', 'restaurant']
        params = self.yelp.getKwargs()
        params[self.yelp.getSearchTerm()] = '+'.join(query)
        params['offset'] = 0
        results = self.yelp.callAPI(params)
        self.assertGreaterEqual(len(results), 20)

    def test_callMulApi_term(self):
        """Two term queries issued via callMulAPI return >= 200 results."""
        queries = [['tai'], ['restaurant']]
        results = self.yelp.callMulAPI(queries)
        self.assertGreaterEqual(len(results), 200)

    def test_callMulApi_categories(self):
        """Searching on the 'categories' parameter returns >= 200 results."""
        self.yelp.setSearchTerm('categories')
        categories = [['bars'], ['french']]
        results = self.yelp.callMulAPI(categories)
        self.assertGreaterEqual(len(results), 200)

    def test_callMulApi_sort(self):
        """Searching on the 'sort_by' parameter returns >= 200 results."""
        self.yelp.setSearchTerm('sort_by')
        sort_by = [['rating'], ['best_match']]
        results = self.yelp.callMulAPI(sort_by)
        self.assertGreaterEqual(len(results), 200)
def _deeper_api_kind(api_name):
    """Map the API label in *api_name* to a short internal key.

    Labels are matched by substring, in a fixed order. Raises ``ValueError``
    when none of the supported labels is present.
    """
    markers = (
        ('dblp Publ API', 'dblp'),
        ('aminer Publ API', 'aminer'),
        ('yelp Search API', 'yelp'),
        ('google Place API', 'google'),
    )
    for marker, kind in markers:
        if marker in api_name:
            return kind
    raise ValueError('unsupported API: %r' % (api_name,))


def _deeper_hidden_fields(kind):
    """Return ``(unique-id field, match fields)`` of the hidden database."""
    if kind == 'dblp':
        return 'info.key', ['info.title']
    if kind == 'aminer':
        return 'id', ['title']
    if kind == 'yelp':
        return 'id', ['name', 'location.display_address.*']
    return 'place_id', ['name', 'formatted_address']


def _deeper_build_api(kind, api_msg, naive):
    """Construct the API client for *kind*, sized for a smart or naive crawl."""
    import os

    if kind == 'dblp':
        size = 1 if naive else 1000
        return PublApi(top_k=size, delay=5, search_term='q', h=size)
    if kind == 'aminer':
        top_k, size = (1, 1) if naive else (300, 100)
        return AdvancedPublApi(top_k=top_k, delay=5, search_term='term',
                               size=size, sort='relevance')
    if kind == 'yelp':
        # NOTE(security): credentials should not live in source control.
        # They can be supplied through the environment; the literals are
        # kept only as a backward-compatible fallback and should be rotated
        # and removed.
        client_id = os.environ.get("YELP_CLIENT_ID", "QhqrWe9agsd0Ad6Gs0qgMQ")
        client_secret = os.environ.get(
            "YELP_CLIENT_SECRET",
            "6WQWRMV8edOhaThyWgm96wAJkIzJ1pHOhm5N0AD20edrnzv0lwi3wfgZAFp0IqQ6WIc-pZki83kjpViwptlcsiV0-Ij3HI6AJxhOTE4jsjNOoZOHZI3823twg8yZWXYx")
        top_k, limit = (1, 1) if naive else (300, 50)
        # The naive crawl used to search a hard-coded 'AZ' regardless of the
        # requested location; it now uses the same location as the smart
        # crawl.
        return SearchApi(client_id=client_id, client_secret=client_secret,
                         top_k=top_k, delay=5, search_term='term',
                         limit=limit, location=api_msg[1])
    # NOTE(security): same caveat as the Yelp credentials above.
    google_key = os.environ.get("GOOGLE_PLACES_API_KEY",
                                'AIzaSyDhBJSPqHfcEkPGQGbH7l3eWyF_PhF10iw')
    # NOTE(review): smart and naive crawls both use top_k=60 here, unlike
    # the other APIs where the naive crawl uses top_k=1 -- confirm intended.
    return TextSearchApi(location='in+' + api_msg[1], top_k=60, delay=5,
                         search_term='query', key=google_key)


def _deeper_sample_data(kind, api_msg):
    """Load the sample data set that matches the selected API."""
    if kind in ('dblp', 'aminer'):
        sample_file = settings.BASE_DIR + '/netdisk/dblp_sample.csv'
        return SampleData(sample_ratio=0.5, samplepath=sample_file,
                          filetype='csv', uniqueid="key", querylist=["title"])
    sample_file = "%s/netdisk/yelp_sample_%s.pkl" % (settings.BASE_DIR,
                                                     api_msg[1])
    return SampleData(sample_ratio=0.5, samplepath=sample_file,
                      filetype='pkl', uniqueid="business_id",
                      querylist=["name"])


def _deeper_local_data(original_csv, local_match, hidden_match, hidden_uid,
                       hidden_fields):
    """Wrap *original_csv* in a ``LocalData`` keyed for the hidden database.

    When the hidden unique-id field has no local counterpart, an ``AutoID``
    column holding the row index is appended to *original_csv* in place.
    """
    if hidden_uid in hidden_match:
        unique_id = local_match[hidden_match.index(hidden_uid)]
    else:
        unique_id = 'AutoID'
        original_csv[0].append(unique_id)
        for i in range(1, len(original_csv)):
            original_csv[i].append(i)
    # local_match and hidden_match are parallel lists: position k pairs a
    # local column with its hidden-database field.
    local_fields = [local_match[hidden_match.index(field)]
                    for field in hidden_fields]
    return LocalData(uniqueid=unique_id,
                     querylist=[local_fields[0]],
                     matchlist=local_fields,
                     data_raw=original_csv)


def Deeper_WEB(budget, api_msg, original_csv, local_match, hidden_match):
    """Run SmartCrawl and NaiveCrawl for an uploaded table and collect results.

    Args:
        budget: passed unchanged to ``SmartCrawl`` and ``NaiveCrawl``.
        api_msg: sequence whose first item names the API ('dblp Publ API',
            'aminer Publ API', 'yelp Search API' or 'google Place API') and
            whose second item is the location (read only for Yelp/Google).
        original_csv: list of rows, header row first. Modified in place:
            cells containing ``</span>`` are replaced by their parsed text
            and an ``AutoID`` column may be appended.
        local_match: local column names, parallel to *hidden_match*.
        hidden_match: hidden-database field names, parallel to *local_match*.

    Returns:
        dict with keys ``smart_queries``, ``record``, ``local_header``,
        ``naive``, ``naive_queries`` and, when the crawl produced a header,
        ``hidden_header``.

    Raises:
        ValueError: if ``api_msg[0]`` names no supported API (raised before
            any input is modified), or if a required hidden field is missing
            from *hidden_match*.
    """
    # Validate first so an unsupported API fails cleanly instead of hitting
    # an unbound local after the table has already been altered.
    kind = _deeper_api_kind(api_msg[0])
    hidden_uid, hidden_fields = _deeper_hidden_fields(kind)

    # Replace the first marked-up cell of each data row with its plain text
    # and remember that row's id (column 0).
    typo_ids = []
    parser = Deeper_HTMLParser()
    for row in original_csv[1:]:
        for j in range(1, len(row)):
            if '</span>' in row[j]:
                parser.feed(row[j])
                row[j] = parser.get_text()
                typo_ids.append(row[0])
                break

    api = _deeper_build_api(kind, api_msg, False)
    sampledata = _deeper_sample_data(kind, api_msg)
    hiddendata = HiddenData(uniqueid=hidden_uid, matchlist=list(hidden_fields))
    localdata = _deeper_local_data(original_csv, local_match, hidden_match,
                                   hidden_uid, hidden_fields)

    SmartCrawl(budget, api, sampledata, localdata, hiddendata)
    localdata_csv = localdata.getRawData()
    crawldata_csv = Json2csv(hiddendata.getMergeResult()).getCsvdata()

    result = {}
    result['smart_queries'] = hiddendata.getQueryList()
    result['record'] = []
    result['local_header'] = localdata_csv['header']
    if 'header' in crawldata_csv:
        result['hidden_header'] = crawldata_csv['header']
        match_items = list(hiddendata.getMatchPair().items())
        try:
            # Prefer numeric ordering of local ids ...
            matchpair = sorted(match_items, key=lambda item: int(item[0]))
        except ValueError:
            # ... and fall back to plain ordering for non-numeric ids.
            matchpair = sorted(match_items, key=lambda item: item[0])
        for local_id, hidden_ids in matchpair:
            record = [localdata_csv[local_id]]
            record.extend(crawldata_csv[hidden_id] for hidden_id in hidden_ids)
            result['record'].append(record)

    # NOTE(review): the naive crawl is run unconditionally; the source this
    # was restored from had lost its indentation -- confirm it was not meant
    # to run only when the smart crawl returned nothing.
    naive_api = _deeper_build_api(kind, api_msg, True)
    naive_hidden = HiddenData(uniqueid=hidden_uid,
                              matchlist=list(hidden_fields))
    result['naive'] = NaiveCrawl(budget, naive_api, localdata, naive_hidden,
                                 typo_ids)
    result['naive_queries'] = naive_hidden.getQueryList()
    return result
class YelpSearchapiTestCase(unittest.TestCase):
    """Integration tests for the Yelp ``SearchApi`` wrapper (``top_k=1000``)."""

    def setUp(self):
        """Create a fresh ``SearchApi`` client for each test."""
        import os

        # NOTE(security): credentials should not live in source control.
        # They can be supplied through the environment; the literals below
        # are kept only as a backward-compatible fallback and should be
        # rotated and removed.
        client_id = os.environ.get("YELP_CLIENT_ID", "kCe2YbZePXsPnC204ZrXoQ")
        client_secret = os.environ.get(
            "YELP_CLIENT_SECRET",
            "s9KnvEEQW7jaA2wlrBi4X2fnDQ0F7asdklXVvJUidWp8i50ov24E8EjkHX2AUhoL")
        search_term = 'term'
        parameters = {'limit': 50, 'location': 'AZ'}
        self.yelp = SearchApi(client_id=client_id,
                              client_secret=client_secret,
                              top_k=1000,
                              delay=5,
                              search_term=search_term,
                              **parameters)

    def tearDown(self):
        """Close the client's session after every test, pass or fail."""
        # Previously only test_callMulApi_sort closed the session, and only
        # when its API call succeeded.
        try:
            self.yelp.getSession().close()
        finally:
            self.yelp = None

    def test_callApi(self):
        """A single call with a joined two-word query returns >= 20 results."""
        query = ['tai', 'restaurant']
        params = self.yelp.getKwargs()
        params[self.yelp.getSearchTerm()] = '+'.join(query)
        params['offset'] = 0
        results = self.yelp.callAPI(params)
        self.assertGreaterEqual(len(results), 20)

    def test_callMulApi_term(self):
        """Two term queries issued via callMulAPI return >= 1000 results."""
        queries = [['tai'], ['restaurant']]
        results = self.yelp.callMulAPI(queries)
        self.assertGreaterEqual(len(results), 1000)

    def test_callMulApi_categories(self):
        """Searching on the 'categories' parameter returns >= 700 results."""
        self.yelp.setSearchTerm('categories')
        categories = [['bars'], ['french']]
        results = self.yelp.callMulAPI(categories)
        # Function-call form works on both Python 2 and Python 3.
        print(len(results))
        self.assertGreaterEqual(len(results), 700)

    def test_callMulApi_sort(self):
        """Searching on the 'sort_by' parameter returns >= 1000 results."""
        self.yelp.setSearchTerm('sort_by')
        sort_by = [['rating'], ['best_match']]
        results = self.yelp.callMulAPI(sort_by)
        self.assertGreaterEqual(len(results), 1000)