Esempio n. 1
0
class Extractor(Worker):
    """Worker that stores a fetched result page and schedules the next page."""

    def __init__(self, exit_flag, job_queue, done_queue, api_key_manager):
        """Initialise the base worker and create the storage handle.

        ``api_key_manager`` is accepted but not used by this class; the
        literal ``-1`` is forwarded as the fourth ``Worker`` argument.
        """
        Worker.__init__(self, exit_flag, job_queue, done_queue, -1)
        self.crud = MongoCRUD()

    def do_job(self, job):
        """Persist the results carried by ``job`` and chain a follow-up job.

        Jobs whose ``status`` is not 1 are ignored. When the page carries a
        ``next_page_token``, a job for the next page is appended to
        ``job.new_urls``.
        """
        payload = job.data
        if payload["status"] != 1:
            return

        page_json = payload["page_json"]
        self.crud.save_map_data_insert(page_json["result"])

        if "next_page_token" not in page_json:
            return

        # Derive the follow-up URL from the job's base URL and the token.
        following_url = JobURL.next_url(
            payload["base_url"], page_json["next_page_token"]
        )
        job.new_urls.append(GooglePOIJob(following_url, job.page_idx + 1))
Esempio n. 2
0
class Extractor(Worker):
    """Worker that saves one page of results and queues the page after it."""

    def __init__(self, exit_flag, job_queue, done_queue, api_key_manager):
        """Set up the underlying worker and the storage handle.

        ``api_key_manager`` is part of the signature but is not used here;
        ``-1`` is passed through as the fourth ``Worker`` argument.
        """
        Worker.__init__(self, exit_flag, job_queue, done_queue, -1)
        self.crud = MongoCRUD()

    def do_job(self, job):
        """Store the job's results; append a next-page job when a token exists.

        Nothing happens unless ``job.data['status']`` equals 1.
        """
        info = job.data
        if info['status'] != 1:
            return

        body = info['page_json']
        self.crud.save_map_data_insert(body['result'])

        has_more = 'next_page_token' in body
        if has_more:
            token = body['next_page_token']
            target = JobURL.next_url(info['base_url'], token)
            next_index = job.page_idx + 1
            job.new_urls.append(GooglePOIJob(target, next_index))
Esempio n. 3
0
class GooglePlacesParser():
    """Build Google Places search URLs, queue them, and store fetched results.

    API keys come from the module-level ``app_keys`` mapping, whose values
    are used as per-key counters: the counter is incremented for every URL
    built with that key, and a key is replaced once its counter exceeds 999
    or the service answers ``OVER_QUERY_LIMIT``.
    """

    def __init__(self, redis_que):
        """Create the storage handle and take the first API key.

        redis_que -- queue object; its ``put`` method receives every URL.
        """
        self.crud = MongoCRUD()
        self.keys_count = app_keys
        # Materialise the keys as a list: a Python 3 ``dict.keys()`` view
        # has no ``pop`` method.
        self.keys = list(app_keys)
        self.key_begin = self.app_keys_pop()
        self.redis_queue = redis_que

    def app_keys_pop(self):
        """Return the next unused API key; exit the process if none remain."""
        if self.keys:
            return self.keys.pop()
        print("*------*-*all app keys have been used*-*-----*")
        sys.exit()

    def app_keys_count(self, key):
        """Count one more use of ``key`` and return the key to use.

        When the counter of ``key`` is already above 999 a fresh key is
        popped and counted instead. The returned key also becomes
        ``self.key_begin``.
        """
        count = self.keys_count[key]
        if count > 999:
            key = self.app_keys_pop()
            count = self.keys_count[key]
        count = count + 1
        self.keys_count[key] = count
        self.key_begin = key
        print(self.key_begin)
        print(count)
        return key

    def change_radius(self):
        """Return the search radius (500)."""
        return 500

    def change_language(self):
        """Return the language code sent with every request ('zh-TW')."""
        return 'zh-TW'

    def get_url(self, location, type):
        """Return the search URL for ``location`` and the given place types.

        location -- mapping with ``'lat'`` and ``'lng'`` entries.
        type     -- iterable of strings, joined with ``'|'``. The name
                    shadows the builtin but is kept for existing callers.

        The URL ends with an empty ``&pagetoken=`` so that a page token can
        be substituted later. Building a URL counts one use of the current
        key.
        """
        print('*********')
        pieces = [
            'https://maps.googleapis.com/maps/api/place/search/json?sensor=false',
            '&language=%s' % self.change_language(),
            '&location=' + '%s,%s' % (location['lat'], location['lng']),
            '&radius=%s' % self.change_radius(),  # 500 m
            '&types=%s' % '|'.join(type),
            '&key=%s' % self.app_keys_count(self.key_begin),
            '&pagetoken=',
        ]
        return ''.join(pieces)

    def save_url_ToQueue(self):
        """Queue one URL per (location, type group) and mark locations done.

        There is no up-front check for spare keys: the active key is already
        held in ``self.key_begin`` and ``app_keys_pop`` exits the process
        once the keys are really exhausted.
        """
        for location in self.crud.read_all_locations():
            for place_types in types:
                url = self.get_url(location, place_types)
                print(url)
                self.redis_queue.put(url)
            self.crud.update_location_status(location['_id'])

    def parse_html(self, url):
        """Fetch ``url``, store its results and queue any follow-up URL."""
        # Pause for two seconds before every request.
        time.sleep(2)
        response = Browser(url).get_html()
        self._handle_response(url, response)

    def _handle_response(self, url, response):
        """Store the results of ``response`` and queue the follow-up URL."""
        status = response['status']
        if status == 'OK':
            self.crud.save_map_data_insert(response['results'])
            if 'next_page_token' in response:
                pagetoken = '&pagetoken=%s' % response['next_page_token']
                # A callable replacement inserts the token verbatim, with no
                # backslash-escape processing by ``re.sub``.
                self.redis_queue.put(
                    re.sub(r'&pagetoken=.*', lambda _m: pagetoken, url))
        elif status == 'OVER_QUERY_LIMIT':
            # Switch keys for this URL and for every URL built afterwards
            # (``get_url`` reads ``self.key_begin``).
            self.key = self.key_begin = self.app_keys_pop()
            key_part = '&key=%s&pagetoken' % self.key
            self.redis_queue.put(
                re.sub(r'&key=.*&pagetoken', lambda _m: key_part, url))
Esempio n. 4
0
class GooglePlacesParser():
    """Query the Google Places search API per location and store the results.

    One API key is active at a time (``self.key``); it is replaced by the
    next key from ``self.keys`` when the service answers
    ``OVER_QUERY_LIMIT``.
    """

    def __init__(self):
        """Create the storage handle and take the first API key."""
        self.crud = MongoCRUD()
        # NOTE(review): this aliases the module-level ``app_keys``, so popping
        # keys mutates it; assumes ``app_keys`` supports ``pop()`` without
        # arguments (e.g. a list) -- confirm.
        self.keys = app_keys
        self.key = self.app_keys_pop()

    def app_keys_pop(self):
        """Return the next unused API key; exit the process if none remain."""
        if len(self.keys) > 0:
            return self.keys.pop()
        print("*------*-*all app keys have been used*-*-----*")
        sys.exit()

    def change_radius(self):
        """Return the search radius (500)."""
        return 500

    def change_language(self):
        """Return the language code sent with every request ('zh-TW')."""
        return 'zh-TW'

    def get_url(self, location, type):
        """Return the search URL for ``location`` and the given place types.

        location -- mapping with ``'lat'`` and ``'lng'`` entries.
        type     -- iterable of strings, appended to ``establishment`` and
                    joined with ``'|'``. The name shadows the builtin but is
                    kept for existing callers.

        The URL ends with an empty ``&pagetoken=`` so that a page token can
        be substituted later.
        """
        pieces = [
            'https://maps.googleapis.com/maps/api/place/search/json?sensor=false',
            '&language=%s' % self.change_language(),
            '&location=' + '%s,%s' % (location['lat'], location['lng']),
            '&radius=%s' % self.change_radius(),  # 500 m
            '&types=establishment|%s' % '|'.join(type),
            '&key=%s' % self.key,
            '&pagetoken=',
        ]
        return ''.join(pieces)

    def run(self):
        """Fetch every (location, type group) pair and mark locations done.

        There is no up-front check for spare keys: the active key is already
        held in ``self.key`` and ``app_keys_pop`` exits the process once the
        keys are really exhausted.
        """
        for location in self.crud.read_all_locations():
            for place_types in types:
                url = self.get_url(location, place_types)
                print(url)
                self.parse_html(url)
            self.crud.update_location_status(location['_id'])

    def parse_html(self, url):
        """Fetch ``url`` and each follow-up request it leads to.

        Follow-ups are further result pages and retries with a fresh key.
        They are handled in a loop, so the call stack does not grow with the
        number of pages or retries.
        """
        while True:
            # Pause for two seconds before every request.
            time.sleep(2)
            response = Browser(url).get_html()
            url = self._next_url(url, response)
            if url is None:
                return

    def _next_url(self, url, response):
        """Store the results of ``response``; return the follow-up URL or None."""
        status = response['status']
        if status == 'OK':
            self.crud.save_map_data_insert(response['results'])
            if 'next_page_token' not in response:
                return None
            pagetoken = '&pagetoken=%s' % response['next_page_token']
            # A callable replacement inserts the token verbatim, with no
            # backslash-escape processing by ``re.sub``.
            return re.sub(r'&pagetoken=.*', lambda _m: pagetoken, url)
        if status == 'OVER_QUERY_LIMIT':
            # Retry the same request with the next key.
            self.key = self.app_keys_pop()
            key_part = '&key=%s&pagetoken' % self.key
            return re.sub(r'&key=.*&pagetoken', lambda _m: key_part, url)
        return None