Example #1
0
class SamplerTestCase(unittest.TestCase):
    """Runs ``sampler.sota_sampler`` against the Yelp ``SearchApi`` client."""

    def setUp(self):
        """Create the ``SearchApi`` client stored on ``self.yelp``."""
        # NOTE(review): credentials are hard-coded in source; consider loading
        # them from configuration and rotating these values.
        client_id = "kCe2YbZePXsPnC204ZrXoQ"
        client_secret = "s9KnvEEQW7jaA2wlrBi4X2fnDQ0F7asdklXVvJUidWp8i50ov24E8EjkHX2AUhoL"
        search_term = 'term'
        parameters = {'limit': 50, 'location': 'AZ'}
        self.yelp = SearchApi(client_id=client_id,
                              client_secret=client_secret,
                              top_k=300,
                              delay=5,
                              search_term=search_term,
                              **parameters)

    def test_sota_sampler(self):
        """Run the sampler once; the test passes if no exception is raised."""
        # os.path.join keeps the path relative to this file even when
        # dirname(__file__) is empty; concatenating os.path.sep onto an empty
        # dirname would instead resolve against the filesystem root.
        project_root = os.path.abspath(
            os.path.join(os.path.dirname(__file__), "..", ".."))
        local_file = os.path.join(project_root, 'example', 'yelp_3000.pkl')
        try:
            localdata = LocalData(local_file, 'pkl', "row['business_id']",
                                  ["row['name']"],
                                  ["row['name']", "row['full_address']"])
            # Only the second component (the queries) is used below.
            _, localdata_query, _ = localdata.getlocalData()
            initQueries = utils.queryGene(localdata_query, 2)
            sampler.sota_sampler(query_pool=initQueries,
                                 api=self.yelp,
                                 match_term=localdata.getQueryList(),
                                 top_k=300,
                                 adjustment=1,
                                 samplenum=1)
        finally:
            # Release the API session even when loading or sampling fails.
            self.yelp.getSession().close()
Example #2
0
class SamplerTestCase(unittest.TestCase):
    """Runs ``sampler.sota_sampler`` on CSV local data with the Yelp client."""

    def setUp(self):
        """Create the ``SearchApi`` client stored on ``self.yelp``."""
        # NOTE(review): credentials are hard-coded in source; consider loading
        # them from configuration and rotating these values.
        client_id = "QhqrWe9agsd0Ad6Gs0qgMQ"
        client_secret = "6WQWRMV8edOhaThyWgm96wAJkIzJ1pHOhm5N0AD20edrnzv0lwi3wfgZAFp0IqQ6WIc-pZki83kjpViwptlcsiV0-Ij3HI6AJxhOTE4jsjNOoZOHZI3823twg8yZWXYx"
        search_term = 'term'
        parameters = {'limit': 50, 'location': 'AZ'}
        self.yelp = SearchApi(client_id=client_id,
                              client_secret=client_secret,
                              top_k=300,
                              delay=5,
                              search_term=search_term,
                              **parameters)

    def test_sota_sampler(self):
        """Run the sampler once; the test passes if no exception is raised."""
        # os.path.join keeps the path relative to this file even when
        # dirname(__file__) is empty; concatenating os.path.sep onto an empty
        # dirname would instead resolve against the filesystem root.
        project_root = os.path.abspath(
            os.path.join(os.path.dirname(__file__), "..", ".."))
        local_file = os.path.join(project_root, 'yelp_example',
                                  'yelp_3000_AZ.csv')
        try:
            localdata = LocalData(local_file, 'csv', "business_id", ["name"],
                                  ["name", "full_address"])
            # Only the second component (the queries) is used below.
            _, localdata_query, _ = localdata.getlocalData()
            initQueries = utils.queryGene(localdata_query, 2)
            sampler.sota_sampler(query_pool=initQueries,
                                 api=self.yelp,
                                 match_term=localdata.getQueryList(),
                                 top_k=300,
                                 adjustment=1,
                                 samplenum=1)
        finally:
            # Release the API session even when loading or sampling fails.
            self.yelp.getSession().close()
Example #3
0
 def setUp(self):
     """Create the Yelp ``SearchApi`` client and store it on ``self.yelp``."""
     # NOTE(review): credentials are hard-coded in source; consider loading
     # them from configuration instead.
     api_settings = {
         'client_id': "QhqrWe9agsd0Ad6Gs0qgMQ",
         'client_secret': "6WQWRMV8edOhaThyWgm96wAJkIzJ1pHOhm5N0AD20edrnzv0lwi3wfgZAFp0IqQ6WIc-pZki83kjpViwptlcsiV0-Ij3HI6AJxhOTE4jsjNOoZOHZI3823twg8yZWXYx",
         'top_k': 300,
         'delay': 5,
         'search_term': 'term',
         # Additional keyword arguments forwarded to SearchApi.
         'limit': 50,
         'location': 'AZ',
     }
     self.yelp = SearchApi(**api_settings)
Example #4
0
 def setUp(self):
     """Create the Yelp ``SearchApi`` client (top_k=1000) on ``self.yelp``."""
     # NOTE(review): credentials are hard-coded in source; consider loading
     # them from configuration instead.
     credentials = {
         'client_id': "kCe2YbZePXsPnC204ZrXoQ",
         'client_secret': "s9KnvEEQW7jaA2wlrBi4X2fnDQ0F7asdklXVvJUidWp8i50ov24E8EjkHX2AUhoL",
     }
     # Additional keyword arguments forwarded to SearchApi.
     extra_params = {'limit': 50, 'location': 'AZ'}
     api_settings = dict(credentials, top_k=1000, delay=5, search_term='term')
     api_settings.update(extra_params)
     self.yelp = SearchApi(**api_settings)
Example #5
0
categories  string  Optional. Categories to filter the search results with. See the list of supported categories. 
                    The category filter can be a list of comma delimited categories. For example, "bars,french" 
                    will filter by Bars and French. The category identifier should be used (for example "discgolf", 
                    not "Disc Golf").
limit       int     Optional. Number of business results to return. By default, it will return 20. Maximum is 50.
offset      int     Optional. Offset the list of returned business results by this amount.
sort_by     string  Optional. Sort the results by one of these modes: best_match, rating, review_count or distance.
                    By default it's best_match. The rating sort is not strictly sorted by the rating value, but by an 
                    adjusted rating value that takes into account the number of ratings, similar to a bayesian average. 
                    This is so a business with 1 rating of 5 stars doesn't immediately jump to the top. 
"""
# The credential values were blank in this example, which made the file
# unparseable. Replace the placeholders with your own Yelp API credentials
# before running.
client_id = 'YOUR_YELP_CLIENT_ID'
client_secret = 'YOUR_YELP_CLIENT_SECRET'
search_term = 'term'
parameters = {'limit': 50, 'location': 'AZ'}
yelp = SearchApi(client_id=client_id, client_secret=client_secret, top_k=300, delay=5, search_term=search_term,
                 **parameters)

# File layout listed by the original example (kept as a comment so the
# backslashes are not interpreted as string escapes):
#
# \yelp_sample_AZ.pkl
#  yelp_3000_AZ.csv
#  yelp_result\\result_file.pkl
#               result_file.csv
#               match_file.pkl
#               match_file.csv
#               enriched_file.pkl
#               enriched_file.csv
sample_file = 'yelp_sample_AZ.pkl'
localdata_file = 'yelp_3000_AZ.csv'
result_dir = 'yelp_result'
sampledata = SampleData(sample_ratio=0.5, samplepath=sample_file, filetype='pkl', uniqueid="business_id", querylist=["name"])
from deeperlib.api.yelp.searchapi import SearchApi
from deeperlib.core import utils
from deeperlib.data_processing.local_data import LocalData
from deeperlib.estimator import sampler

# ==== Sota-Sampler Yelp ====
# NOTE(review): credentials are hard-coded in source; consider loading them
# from configuration and rotating these values.
client_id = "kCe2YbZePXsPnC204ZrXoQ"
client_secret = "s9KnvEEQW7jaA2wlrBi4X2fnDQ0F7asdklXVvJUidWp8i50ov24E8EjkHX2AUhoL"
search_term = 'term'
parameters = {'limit': 50, 'location': 'AZ'}
yelp = SearchApi(client_id=client_id,
                 client_secret=client_secret,
                 top_k=300,
                 delay=5,
                 search_term=search_term,
                 **parameters)
try:
    # Load the local records and derive the initial query pool from them.
    local_file = 'yelp_3000.pkl'
    localdata = LocalData(local_file, 'pkl', "row['business_id']", ["row['name']"],
                          ["row['name']", "row['full_address']"])
    localdata_ids, localdata_query, localdata_er = localdata.getlocalData()
    initQueries = utils.queryGene(localdata_query, 2)
    sampler.sota_sampler(query_pool=initQueries,
                         api=yelp,
                         match_term=localdata.getQueryList(),
                         top_k=300,
                         adjustment=1)
finally:
    # Release the API session even when loading or sampling fails.
    yelp.getSession().close()
Example #7
0
class YelpSearchapiTestCase(unittest.TestCase):
    """Tests for ``SearchApi.callAPI`` and ``SearchApi.callMulAPI``."""

    def setUp(self):
        """Create a fresh ``SearchApi`` client on ``self.yelp`` for each test."""
        # NOTE(review): credentials are hard-coded in source; consider loading
        # them from configuration and rotating these values.
        client_id = "QhqrWe9agsd0Ad6Gs0qgMQ"
        client_secret = "6WQWRMV8edOhaThyWgm96wAJkIzJ1pHOhm5N0AD20edrnzv0lwi3wfgZAFp0IqQ6WIc-pZki83kjpViwptlcsiV0-Ij3HI6AJxhOTE4jsjNOoZOHZI3823twg8yZWXYx"
        search_term = 'term'
        parameters = {'limit': 50, 'location': 'AZ'}
        self.yelp = SearchApi(client_id=client_id, client_secret=client_secret, top_k=300, delay=5,
                              search_term=search_term,
                              **parameters)

    def tearDown(self):
        """Close the client's session and drop the client after every test."""
        # Previously only test_callMulApi_sort closed the session, so the
        # clients created for the other tests were never closed.
        self.yelp.getSession().close()
        self.yelp = None

    def test_callApi(self):
        """A single ``callAPI`` request returns at least 20 results."""
        query = ['tai', 'restaurant']
        params = self.yelp.getKwargs()
        params[self.yelp.getSearchTerm()] = '+'.join(query)
        params['offset'] = 0
        results = self.yelp.callAPI(params)
        self.assertGreaterEqual(len(results), 20)

    def test_callMulApi_term(self):
        """``callMulAPI`` over two term queries returns at least 200 results."""
        queries = [['tai'], ['restaurant']]
        results = self.yelp.callMulAPI(queries)
        self.assertGreaterEqual(len(results), 200)

    def test_callMulApi_categories(self):
        """``callMulAPI`` with ``categories`` as the search term."""
        self.yelp.setSearchTerm('categories')
        categories = [['bars'], ['french']]
        results = self.yelp.callMulAPI(categories)
        self.assertGreaterEqual(len(results), 200)

    def test_callMulApi_sort(self):
        """``callMulAPI`` with ``sort_by`` as the search term."""
        self.yelp.setSearchTerm('sort_by')
        sort_by = [['rating'], ['best_match']]
        results = self.yelp.callMulAPI(sort_by)
        self.assertGreaterEqual(len(results), 200)
Example #8
0
# NOTE(review): API credentials are committed in source. The values are kept
# unchanged to preserve behaviour, but they should be moved to configuration
# (and rotated) rather than living in the code base.
_YELP_CLIENT_ID = "QhqrWe9agsd0Ad6Gs0qgMQ"
_YELP_CLIENT_SECRET = "6WQWRMV8edOhaThyWgm96wAJkIzJ1pHOhm5N0AD20edrnzv0lwi3wfgZAFp0IqQ6WIc-pZki83kjpViwptlcsiV0-Ij3HI6AJxhOTE4jsjNOoZOHZI3823twg8yZWXYx"
_GOOGLE_PLACES_KEY = 'AIzaSyDhBJSPqHfcEkPGQGbH7l3eWyF_PhF10iw'

# One entry per supported API, checked in this order:
# (marker looked up in api_msg[0], hidden unique-id field, hidden match fields).
_SUPPORTED_APIS = (
    ('dblp Publ API', 'info.key', ('info.title',)),
    ('aminer Publ API', 'id', ('title',)),
    ('yelp Search API', 'id', ('name', 'location.display_address.*')),
    ('google Place API', 'place_id', ('name', 'formatted_address')),
)


def _resolve_api(api_msg):
    """Return ``(marker, id_field, match_fields)`` for the API in api_msg[0]."""
    for marker, id_field, match_fields in _SUPPORTED_APIS:
        if marker in api_msg[0]:
            return marker, id_field, list(match_fields)
    raise ValueError('Unsupported API: %r' % (api_msg[0],))


def _make_api(marker, api_msg, naive):
    """Build the API client for the smart pass or, if ``naive``, the naive pass."""
    if marker == 'dblp Publ API':
        hits = 1 if naive else 1000
        parameters = {'h': hits}
        return PublApi(top_k=hits, delay=5, search_term='q', **parameters)
    if marker == 'aminer Publ API':
        parameters = {'size': 1 if naive else 100, 'sort': 'relevance'}
        return AdvancedPublApi(top_k=1 if naive else 300,
                               delay=5,
                               search_term='term',
                               **parameters)
    if marker == 'yelp Search API':
        # The naive pass used to hard-code location 'AZ' while the smart pass
        # used api_msg[1]; both passes now query the requested location.
        parameters = {'limit': 1 if naive else 50, 'location': api_msg[1]}
        return SearchApi(client_id=_YELP_CLIENT_ID,
                         client_secret=_YELP_CLIENT_SECRET,
                         top_k=1 if naive else 300,
                         delay=5,
                         search_term='term',
                         **parameters)
    # Only 'google Place API' remains; it is configured identically for both
    # passes (top_k stays 60 in the naive pass, as before).
    parameters = {'key': _GOOGLE_PLACES_KEY}
    return TextSearchApi(location='in+' + api_msg[1],
                         top_k=60,
                         delay=5,
                         search_term='query',
                         **parameters)


def _load_sample_data(marker, api_msg):
    """Build the ``SampleData`` that accompanies the selected API."""
    if marker in ('dblp Publ API', 'aminer Publ API'):
        sample_file = settings.BASE_DIR + '/netdisk/dblp_sample.csv'
        return SampleData(sample_ratio=0.5,
                          samplepath=sample_file,
                          filetype='csv',
                          uniqueid="key",
                          querylist=["title"])
    # The Google Place API reads the Yelp sample file too, as it did before.
    sample_file = "%s/netdisk/yelp_sample_%s.pkl" % (settings.BASE_DIR,
                                                     api_msg[1])
    return SampleData(sample_ratio=0.5,
                      samplepath=sample_file,
                      filetype='pkl',
                      uniqueid="business_id",
                      querylist=["name"])


def _build_local_data(original_csv, local_match, hidden_match, id_field,
                      match_fields):
    """Wrap ``original_csv`` in a ``LocalData`` keyed by the mapped id column.

    When ``id_field`` has no mapping in ``hidden_match``, an ``AutoID`` column
    holding the 1-based row number is appended to ``original_csv`` in place.
    """
    # Resolve the mapped columns first so a missing mapping fails before the
    # table is modified.
    match_columns = [local_match[hidden_match.index(field)]
                     for field in match_fields]
    if id_field in hidden_match:
        unique_column = local_match[hidden_match.index(id_field)]
    else:
        unique_column = 'AutoID'
        original_csv[0].append(unique_column)
        for row_number in range(1, len(original_csv)):
            original_csv[row_number].append(row_number)
    # Queries use only the first match field; matching uses all of them.
    return LocalData(uniqueid=unique_column,
                     querylist=match_columns[:1],
                     matchlist=match_columns,
                     data_raw=original_csv)


def Deeper_WEB(budget, api_msg, original_csv, local_match, hidden_match):
    """Run SmartCrawl and NaiveCrawl for an uploaded table and collect results.

    Args:
        budget: Passed unchanged to ``SmartCrawl`` and ``NaiveCrawl``.
        api_msg: ``api_msg[0]`` selects the API (it must contain one of
            'dblp Publ API', 'aminer Publ API', 'yelp Search API' or
            'google Place API'); ``api_msg[1]`` is the location used by the
            Yelp and Google branches.
        original_csv: Table as a list of rows with the header in row 0.
            Modified in place: cells containing '</span>' are replaced by the
            parser's text, and an 'AutoID' column may be appended.
        local_match: Local column names, parallel to ``hidden_match``.
        hidden_match: Hidden-side field names; ``local_match[i]`` is the local
            column mapped to ``hidden_match[i]``.

    Returns:
        dict with keys 'smart_queries', 'record', 'local_header', 'naive' and
        'naive_queries', plus 'hidden_header' when the crawl output contains
        a 'header' entry.

    Raises:
        ValueError: If ``api_msg[0]`` names no supported API. Raised before
            ``original_csv`` is touched.
    """
    # Fail fast on an unknown API instead of hitting an unbound local later.
    marker, id_field, match_fields = _resolve_api(api_msg)

    # Strip markup from cells containing '</span>' and remember the ids
    # (column 0) of the affected rows; only the first such cell per row is
    # rewritten.
    # NOTE(review): one parser instance is reused for every row; whether
    # get_text() resets between feeds depends on Deeper_HTMLParser - confirm.
    typo_ids = []
    parser = Deeper_HTMLParser()
    for row in original_csv[1:]:
        for col in range(1, len(row)):
            if '</span>' in row[col]:
                parser.feed(row[col])
                row[col] = parser.get_text()
                typo_ids.append(row[0])
                break

    # ---- Smart crawl ----
    api = _make_api(marker, api_msg, naive=False)
    sampledata = _load_sample_data(marker, api_msg)
    hiddendata = HiddenData(uniqueid=id_field, matchlist=list(match_fields))
    localdata = _build_local_data(original_csv, local_match, hidden_match,
                                  id_field, match_fields)

    SmartCrawl(budget, api, sampledata, localdata, hiddendata)
    localdata_csv = localdata.getRawData()
    crawldata_csv = Json2csv(hiddendata.getMergeResult()).getCsvdata()

    result = {}
    result['smart_queries'] = hiddendata.getQueryList()
    result['record'] = []
    result['local_header'] = localdata_csv['header']
    if 'header' in crawldata_csv:
        result['hidden_header'] = crawldata_csv['header']
        # Prefer numeric ordering of local ids; fall back to plain ordering
        # when an id is not an integer literal.
        try:
            matchpair = sorted(hiddendata.getMatchPair().items(),
                               key=lambda item: int(item[0]))
        except ValueError:
            matchpair = sorted(hiddendata.getMatchPair().items(),
                               key=lambda item: item[0])
        for local_id, hidden_ids in matchpair:
            # Each record is the local row followed by its matched hidden rows.
            record = [localdata_csv[local_id]]
            record.extend(crawldata_csv[hidden_id] for hidden_id in hidden_ids)
            result['record'].append(record)

    # ---- Naive crawl: fresh client and hidden-data store, same local data ----
    api = _make_api(marker, api_msg, naive=True)
    hiddendata = HiddenData(uniqueid=id_field, matchlist=list(match_fields))
    result['naive'] = NaiveCrawl(budget, api, localdata, hiddendata, typo_ids)
    result['naive_queries'] = hiddendata.getQueryList()
    return result
Example #9
0
class YelpSearchapiTestCase(unittest.TestCase):
    """Tests for ``SearchApi.callAPI`` and ``SearchApi.callMulAPI`` (top_k=1000)."""

    def setUp(self):
        """Create a fresh ``SearchApi`` client on ``self.yelp`` for each test."""
        # NOTE(review): credentials are hard-coded in source; consider loading
        # them from configuration and rotating these values.
        client_id = "kCe2YbZePXsPnC204ZrXoQ"
        client_secret = "s9KnvEEQW7jaA2wlrBi4X2fnDQ0F7asdklXVvJUidWp8i50ov24E8EjkHX2AUhoL"
        search_term = 'term'
        parameters = {'limit': 50, 'location': 'AZ'}
        self.yelp = SearchApi(client_id=client_id, client_secret=client_secret, top_k=1000, delay=5,
                              search_term=search_term,
                              **parameters)

    def tearDown(self):
        """Close the client's session and drop the client after every test."""
        # Previously only test_callMulApi_sort closed the session, so the
        # clients created for the other tests were never closed.
        self.yelp.getSession().close()
        self.yelp = None

    def test_callApi(self):
        """A single ``callAPI`` request returns at least 20 results."""
        query = ['tai', 'restaurant']
        params = self.yelp.getKwargs()
        params[self.yelp.getSearchTerm()] = '+'.join(query)
        params['offset'] = 0
        results = self.yelp.callAPI(params)
        self.assertGreaterEqual(len(results), 20)

    def test_callMulApi_term(self):
        """``callMulAPI`` over two term queries returns at least 1000 results."""
        queries = [['tai'], ['restaurant']]
        results = self.yelp.callMulAPI(queries)
        self.assertGreaterEqual(len(results), 1000)

    def test_callMulApi_categories(self):
        """``callMulAPI`` with ``categories`` as the search term."""
        self.yelp.setSearchTerm('categories')
        categories = [['bars'], ['french']]
        results = self.yelp.callMulAPI(categories)
        # Parenthesised single-argument form prints the same value under
        # Python 2 and is valid syntax under Python 3.
        print(len(results))
        self.assertGreaterEqual(len(results), 700)

    def test_callMulApi_sort(self):
        """``callMulAPI`` with ``sort_by`` as the search term."""
        self.yelp.setSearchTerm('sort_by')
        sort_by = [['rating'], ['best_match']]
        results = self.yelp.callMulAPI(sort_by)
        self.assertGreaterEqual(len(results), 1000)