コード例 #1
0
 def __init__(self, redis_que):
     """Wire up the output queue, the Mongo accessor and API-key bookkeeping.

     redis_que is stored unchanged as ``self.redis_queue``.
     """
     self.redis_queue = redis_que
     self.crud = MongoCRUD()
     # keys_count is the module-level app_keys object itself (no copy is made).
     self.keys_count = app_keys
     self.keys = app_keys.keys()
     # The first key handed out by app_keys_pop() becomes the starting key.
     self.key_begin = self.app_keys_pop()
コード例 #2
0
ファイル: crawler.py プロジェクト: vickyi/koala
class Extractor(Worker):
    """Worker that persists a fetched page and schedules the following page."""

    def __init__(self, exit_flag, job_queue, done_queue, api_key_manager):
        """Initialise the base Worker (with -1 as its last argument) and Mongo access.

        ``api_key_manager`` is accepted but not used in this class.
        """
        Worker.__init__(self, exit_flag, job_queue, done_queue, -1)
        self.crud = MongoCRUD()

    def do_job(self, job):
        """Save the results carried by ``job`` and append a follow-up job if any.

        Nothing happens unless ``job.data["status"]`` equals 1. When the page
        JSON contains ``next_page_token``, a ``GooglePOIJob`` for the next page
        is appended to ``job.new_urls``.
        """
        if job.data["status"] != 1:
            return

        page_json = job.data["page_json"]
        self.crud.save_map_data_insert(page_json["result"])

        if "next_page_token" not in page_json:
            return

        token = page_json["next_page_token"]
        follow_up_url = JobURL.next_url(job.data["base_url"], token)
        follow_up_job = GooglePOIJob(follow_up_url, job.page_idx + 1)
        job.new_urls.append(follow_up_job)
コード例 #3
0
class Extractor(Worker):
    """Stores page results in Mongo and queues the next results page."""

    def __init__(self, exit_flag, job_queue, done_queue, api_key_manager):
        """Set up the base Worker and the Mongo accessor.

        ``api_key_manager`` is part of the signature but is not referenced here.
        """
        worker_args = (exit_flag, job_queue, done_queue, -1)
        Worker.__init__(self, *worker_args)
        self.crud = MongoCRUD()

    def do_job(self, job):
        """Handle one finished fetch job.

        Only jobs whose ``data['status']`` is 1 are processed: their
        ``page_json['result']`` is saved, and if a ``next_page_token`` is
        present a new ``GooglePOIJob`` is appended to ``job.new_urls``.
        """
        succeeded = job.data['status'] == 1
        if not succeeded:
            return

        body = job.data['page_json']
        self.crud.save_map_data_insert(body['result'])

        has_more = 'next_page_token' in body
        if has_more:
            next_token = body['next_page_token']
            next_page_url = JobURL.next_url(job.data['base_url'], next_token)
            next_page_idx = job.page_idx + 1
            job.new_urls.append(GooglePOIJob(next_page_url, next_page_idx))
コード例 #4
0
ファイル: spider.py プロジェクト: MeTrina/RedisQueue
 def __init__(self,redis_que):
     """Store the queue and prepare Mongo access plus API-key state."""
     # Keep the caller's queue object as-is.
     self.redis_queue = redis_que
     self.crud = MongoCRUD()
     # Both attributes are derived from the module-level app_keys object:
     # keys_count is that object itself, keys is whatever .keys() returns.
     self.keys_count = app_keys
     self.keys = app_keys.keys()
     # Start with the first key that app_keys_pop() hands out.
     self.key_begin = self.app_keys_pop()
コード例 #5
0
ファイル: crawler.py プロジェクト: vickyi/koala
 def __init__(self, exit_flag, job_queue, done_queue, api_key_manager):
     """Initialise the base Worker and create the Mongo accessor.

     ``api_key_manager`` is accepted but not used in this constructor.
     """
     base_args = (exit_flag, job_queue, done_queue, -1)
     Worker.__init__(self, *base_args)
     self.crud = MongoCRUD()
コード例 #6
0
def api_crud(audi_file_type=None, audi_file_id=None):
    """Dispatch a CRUD request for one audio-file record to ``MongoCRUD``.

    Args:
        audi_file_type: record type; must be a member of ``VALID_FORMAT``.
        audi_file_id: identifier passed to ``MongoCRUD`` as ``id``.

    Returns:
        The jsonified result of read/delete/update/create for
        GET/DELETE/PUT/POST, or an ``(error_dict, status)`` tuple:
        422 for an unsupported type, 405 for any other HTTP method and
        500 for any unexpected exception.
    """
    try:
        if audi_file_type not in VALID_FORMAT:
            raise UnsupportedRequestError("Only formats {} are allowed".format(
                ", ".join(VALID_FORMAT)))
        request_method = request.method

        if request_method == "GET":
            data_obj = MongoCRUD(audi_file_type, id=audi_file_id)
            return jsonify(data_obj.read())

        if request_method == "DELETE":
            data_obj = MongoCRUD(audi_file_type, id=audi_file_id)
            return jsonify(data_obj.delete())

        if request_method == "PUT":
            data = request.json
            data_obj = MongoCRUD(audi_file_type, id=audi_file_id, data=data)
            return jsonify(data_obj.update())

        if request_method == "POST":
            data = request.json
            data_obj = MongoCRUD(audi_file_type, id=audi_file_id, data=data)
            return jsonify(data_obj.create())

        # Previously any other method fell through and returned None, which
        # is not a valid view response; answer explicitly instead.
        return {"result": {"error": "Method {} is not allowed".format(
            request_method)}}, 405

    except UnsupportedRequestError as error:
        # ``.message`` is not guaranteed on exception objects; fall back to
        # the string form when the attribute is missing.
        return {"result": {"error": getattr(error, "message", str(error))}}, 422

    except Exception as exc:
        # Top-level boundary: report the failure to the client as a 500.
        return {"result": {"error": str(exc)}}, 500
コード例 #7
0
class GooglePlacesParser():
    """Build Google Places search URLs, queue them and parse the responses.

    Depends on names defined elsewhere in the module: ``MongoCRUD``,
    ``app_keys``, ``types`` and ``Browser``.
    """

    def __init__(self, redis_que):
        """Create the parser.

        Args:
            redis_que: queue object; this class only calls ``put(url)`` on it.
        """
        self.crud = MongoCRUD()
        # Same object as the module-level app_keys: counters are updated in place.
        self.keys_count = app_keys
        # Copy the key names into a real list so .pop() also works on
        # Python 3, where dict.keys() returns a view without pop().
        self.keys = list(app_keys.keys())
        self.key_begin = self.app_keys_pop()
        self.redis_queue = redis_que

    def app_keys_pop(self):
        """Return the next unused API key; exit the process when none remain."""
        if self.keys:
            return self.keys.pop()
        print("*------*-*all app keys have been used*-*-----*")
        sys.exit()

    def app_keys_count(self, key):
        """Record one use of ``key`` and return the key that was charged.

        If the stored count for ``key`` is above 999, a fresh key is popped
        and charged instead. The charged key becomes ``self.key_begin``.
        """
        count = self.keys_count[key]
        if count > 999:
            key = self.app_keys_pop()
            count = self.keys_count[key]
        count += 1
        self.keys_count[key] = count
        self.key_begin = key
        print(self.key_begin)
        print(count)
        return key

    def change_radius(self):
        """Return the search radius used in generated URLs (500)."""
        return 500

    def change_language(self):
        """Return the language code used in generated URLs ('zh-TW')."""
        return 'zh-TW'

    def get_url(self, location, type):
        """Build a place-search URL for ``location`` and the given place types.

        Args:
            location: mapping with ``'lat'`` and ``'lng'`` entries.
            type: iterable of strings, joined with ``'|'``.

        Returns:
            The URL string, ending in an empty ``&pagetoken=`` parameter.
            Calling this charges one use against the current API key.
        """
        print('*********')
        parts = [
            'https://maps.googleapis.com/maps/api/place/search/json?sensor=false',
            '&language=%s' % self.change_language(),
            '&location=' + '%s,%s' % (location['lat'], location['lng']),
            '&radius=%s' % self.change_radius(),  # 500 m
            '&types=%s' % '|'.join(type),
            '&key=%s' % self.app_keys_count(self.key_begin),
            '&pagetoken=',
        ]
        return ''.join(parts)

    def save_url_ToQueue(self):
        """Queue one URL per (location, type group) and mark locations done.

        Exits the process when no spare API keys are left.
        """
        all_locations = self.crud.read_all_locations()
        if not self.keys:
            print("*------*-*all app keys have been used*-*-----*")
            sys.exit()
        for location in all_locations:
            for place_types in types:
                url = self.get_url(location, place_types)
                print(url)
                self.redis_queue.put(url)
            self.crud.update_location_status(location['_id'])

    def parse_html(self, url):
        """Fetch ``url``, store its results and re-queue any follow-up URL.

        On status ``'OK'`` the results are saved and, when a
        ``next_page_token`` is present, the next-page URL is queued. On
        ``'OVER_QUERY_LIMIT'`` the API key is rotated and the same request is
        queued again with the new key. Any other status is ignored.
        """
        # Fixed 2 s pause before each request (presumably rate limiting).
        time.sleep(2)
        response = Browser(url).get_html()
        status = response['status']
        if status == 'OK':
            # insert to mongo
            self.crud.save_map_data_insert(response['results'])
            if 'next_page_token' in response:
                pagetoken = '&pagetoken=%s' % response['next_page_token']
                url = re.sub(r'&pagetoken=.*', pagetoken, url)
                # The original called a bare save_url_ToQueue(url), which is
                # a NameError; push the follow-up URL onto the queue directly.
                self.redis_queue.put(url)
        elif status == 'OVER_QUERY_LIMIT':
            self.key = self.app_keys_pop()
            # get_url() reads key_begin, so rotate it too; otherwise every
            # later URL would still carry the exhausted key.
            self.key_begin = self.key
            url = re.sub(r'&key=.*&pagetoken', '&key=%s&pagetoken' % self.key,
                         url)
            self.redis_queue.put(url)
コード例 #8
0
ファイル: eu_queue.py プロジェクト: MeTrina/RedisQueue
class EuQueue(object):
    """Redis-list backed queue of fetched pages plus Places URL/key helpers.

    Depends on names defined elsewhere in the module: ``MongoCRUD``,
    ``app_keys`` and ``redis``.
    """

    def __init__(self,name,redis_que):
        """Create the queue.

        Args:
            name: name of the Redis list used by save_html/get_html.
            redis_que: stored as ``self.redis_queue``; not used further here.
        """
        self.crud = MongoCRUD()
        # Same object as the module-level app_keys: counters are updated in place.
        self.keys_count = app_keys
        # Copy the key names into a real list so .pop() also works on
        # Python 3, where dict.keys() returns a view without pop().
        self.keys = list(app_keys.keys())
        self.key_begin = self.app_keys_pop()
        self.redis_queue = redis_que
        self.name = name
        self.db = redis.Redis()

    def app_keys_pop(self):
        """Return the next unused API key; exit the process when none remain."""
        if self.keys:
            return self.keys.pop()
        print("*------*-*all app keys have been used*-*-----*")
        sys.exit()

    def app_keys_count(self, key):
        """Record one use of ``key`` and return the key that was charged.

        If the stored count for ``key`` is above 999, a fresh key is popped
        and charged instead. The charged key becomes ``self.key_begin``.
        """
        count = self.keys_count[key]
        if count > 999:
            key = self.app_keys_pop()
            count = self.keys_count[key]
        count += 1
        self.keys_count[key] = count
        self.key_begin = key
        print(self.key_begin)
        print(count)
        return key

    def change_radius(self):
        """Return the search radius used in generated URLs (500)."""
        return 500

    def change_language(self):
        """Return the language code used in generated URLs ('zh-TW')."""
        return 'zh-TW'

    def get_url(self, location, type):
        """Build a place-search URL for ``location`` and the given place types.

        Args:
            location: mapping with ``'lat'`` and ``'lng'`` entries.
            type: iterable of strings, joined with ``'|'``.

        Returns:
            The URL string, ending in an empty ``&pagetoken=`` parameter.
            Calling this charges one use against the current API key.
        """
        print('*********')
        url = 'https://maps.googleapis.com/maps/api/place/search/json?sensor=false'
        url += '&language=%s' % self.change_language()
        url += '&location=' + '%s,%s' % (location['lat'], location['lng'])
        url += '&radius=%s' % self.change_radius() # 500 m
        url += '&types=%s' % '|'.join(type)
        url += '&key=%s' % self.app_keys_count(self.key_begin)
        url += '&pagetoken='
        return url

    def save_html(self, item):
        """Append ``item`` to the tail of the Redis list ``self.name``."""
        # Single formatted argument keeps the output identical on Python 2 and 3.
        print("save html into eu_queue: %s" % (item,))
        self.db.rpush(self.name, item)

    def get_html(self):
        """Pop and return the head of the Redis list ``self.name``."""
        return self.db.lpop(self.name)

    def parse_html_save(self, item):
        """Save the results of a popped response, driven by ``item['status']``.

        NOTE(review): this method looks unfinished and its logic is left
        unchanged. ``url`` is read before it is ever assigned (it is a local
        here), and ``self.parse_html`` is not defined on this class, so both
        follow-up branches fail at runtime. The status and page token are
        checked on ``item`` while the data is read from the popped response;
        confirm which object is intended before fixing.
        """
        # Fixed 2 s pause before each pop (presumably rate limiting).
        time.sleep(2)
        josn_response = self.get_html()
        status = item['status']
        if status == 'OK':
            results = josn_response['results']
            # insert to mongo
            self.crud.save_html_insert(results)
            if 'next_page_token' in item:
                pagetoken = '&pagetoken=%s' % josn_response['next_page_token']
                url = re.sub(r'&pagetoken=.*', pagetoken, url)
                self.parse_html(url)
            else:
                pass
        elif status == 'OVER_QUERY_LIMIT':
            self.key = self.app_keys_pop()
            url = re.sub(r'&key=.*&pagetoken', '&key=%s&pagetoken' % self.key, url)
            self.parse_html(url)
        else:
            return
        '''
コード例 #9
0
ファイル: spider.py プロジェクト: vickyi/koala
 def __init__(self):
     """Attach the shared key container, Mongo access and the first API key."""
     # self.keys is the module-level app_keys object itself (no copy is made).
     self.keys = app_keys
     self.crud = MongoCRUD()
     # The key handed out by app_keys_pop() is the one used until replaced.
     self.key = self.app_keys_pop()
コード例 #10
0
ファイル: spider.py プロジェクト: vickyi/koala
class GooglePlacesParser():
    """Crawl Google Places search results for stored locations into Mongo.

    Depends on names defined elsewhere in the module: ``MongoCRUD``,
    ``app_keys``, ``types`` and ``Browser``.
    """

    def __init__(self):
        """Attach Mongo access and take the first API key."""
        self.crud = MongoCRUD()
        # self.keys is the module-level app_keys object itself; popping a key
        # here removes it from that shared container.
        self.keys = app_keys
        self.key = self.app_keys_pop()

    def app_keys_pop(self):
        """Return the next unused API key; exit the process when none remain."""
        if self.keys:
            return self.keys.pop()
        print("*------*-*all app keys have been used*-*-----*")
        sys.exit()

    def change_radius(self):
        """Return the search radius used in generated URLs (500)."""
        return 500

    def change_language(self):
        """Return the language code used in generated URLs ('zh-TW')."""
        return 'zh-TW'

    def get_url(self, location, type):
        """Build a place-search URL for ``location`` and the given place types.

        Args:
            location: mapping with ``'lat'`` and ``'lng'`` entries.
            type: iterable of strings, appended after ``establishment``.

        Returns:
            The URL string, ending in an empty ``&pagetoken=`` parameter.
        """
        url = 'https://maps.googleapis.com/maps/api/place/search/json?sensor=false'
        url += '&language=%s' % self.change_language()
        url += '&location=' + '%s,%s' % (location['lat'], location['lng'])
        url += '&radius=%s' % self.change_radius() # 500 m
        url += '&types=establishment|%s' % '|'.join(type)
        url += '&key=%s' % self.key
        url += '&pagetoken='
        return url

    def run(self):
        """Fetch every (location, type group) pair and mark locations done.

        Exits the process when no spare API keys are left.
        """
        all_locations = self.crud.read_all_locations()
        if not self.keys:
            print("*------*-*all app keys have been used*-*-----*")
            sys.exit()
        for location in all_locations:
            # place_types avoids shadowing the builtin ``type`` in the loop.
            for place_types in types:
                url = self.get_url(location, place_types)
                print(url)
                self.parse_html(url)
            self.crud.update_location_status(location['_id'])

    def parse_html(self, url):
        """Fetch ``url`` and every follow-up page, saving results to Mongo.

        Loops (instead of recursing) while the response is ``'OK'`` with a
        ``next_page_token``, or ``'OVER_QUERY_LIMIT'`` (after rotating the API
        key). Stops on the last page or on any other status.
        """
        while True:
            # Fixed 2 s pause before each request (presumably rate limiting).
            time.sleep(2)
            response = Browser(url).get_html()
            status = response['status']
            if status == 'OK':
                # insert to mongo
                self.crud.save_map_data_insert(response['results'])
                if 'next_page_token' not in response:
                    return
                pagetoken = '&pagetoken=%s' % response['next_page_token']
                url = re.sub(r'&pagetoken=.*', pagetoken, url)
            elif status == 'OVER_QUERY_LIMIT':
                # Rotate to a fresh key and retry the same request with it.
                self.key = self.app_keys_pop()
                url = re.sub(r'&key=.*&pagetoken', '&key=%s&pagetoken' % self.key, url)
            else:
                return
コード例 #11
0
ファイル: LatLngInit.py プロジェクト: MeTrina/koala
 def save(self, results):
     """Hand ``results`` to ``MongoCRUD.save_circle_centers`` on a fresh accessor."""
     MongoCRUD().save_circle_centers(results)
コード例 #12
0
 def __init__(self, exit_flag, job_queue, done_queue, api_key_manager):
     """Set up the base Worker, then attach a Mongo accessor.

     ``api_key_manager`` is part of the signature but is not referenced here.
     """
     # -1 is passed through as the base Worker's final positional argument.
     Worker.__init__(self, exit_flag, job_queue, done_queue, *(-1,))
     self.crud = MongoCRUD()
コード例 #13
0
ファイル: LatLngInit.py プロジェクト: dong/RedisQueue
 def save(self, results):
     """Persist ``results`` through a newly created ``MongoCRUD`` instance."""
     store = MongoCRUD()
     persist = store.save_circle_centers
     persist(results)
コード例 #14
0
class EuQueue(object):
    """Redis-list backed queue of fetched pages plus Places URL/key helpers.

    Depends on names defined elsewhere in the module: ``MongoCRUD``,
    ``app_keys`` and ``redis``.
    """

    def __init__(self, name, redis_que):
        """Create the queue.

        Args:
            name: name of the Redis list used by save_html/get_html.
            redis_que: stored as ``self.redis_queue``; not used further here.
        """
        self.crud = MongoCRUD()
        # Same object as the module-level app_keys: counters are updated in place.
        self.keys_count = app_keys
        # Copy the key names into a real list so .pop() also works on
        # Python 3, where dict.keys() returns a view without pop().
        self.keys = list(app_keys.keys())
        self.key_begin = self.app_keys_pop()
        self.redis_queue = redis_que
        self.name = name
        self.db = redis.Redis()

    def app_keys_pop(self):
        """Return the next unused API key; exit the process when none remain."""
        if self.keys:
            return self.keys.pop()
        print("*------*-*all app keys have been used*-*-----*")
        sys.exit()

    def app_keys_count(self, key):
        """Record one use of ``key`` and return the key that was charged.

        If the stored count for ``key`` is above 999, a fresh key is popped
        and charged instead. The charged key becomes ``self.key_begin``.
        """
        count = self.keys_count[key]
        if count > 999:
            key = self.app_keys_pop()
            count = self.keys_count[key]
        count += 1
        self.keys_count[key] = count
        self.key_begin = key
        print(self.key_begin)
        print(count)
        return key

    def change_radius(self):
        """Return the search radius used in generated URLs (500)."""
        return 500

    def change_language(self):
        """Return the language code used in generated URLs ('zh-TW')."""
        return 'zh-TW'

    def get_url(self, location, type):
        """Build a place-search URL for ``location`` and the given place types.

        Args:
            location: mapping with ``'lat'`` and ``'lng'`` entries.
            type: iterable of strings, joined with ``'|'``.

        Returns:
            The URL string, ending in an empty ``&pagetoken=`` parameter.
            Calling this charges one use against the current API key.
        """
        print('*********')
        url = 'https://maps.googleapis.com/maps/api/place/search/json?sensor=false'
        url += '&language=%s' % self.change_language()
        url += '&location=' + '%s,%s' % (location['lat'], location['lng'])
        url += '&radius=%s' % self.change_radius()  # 500 m
        url += '&types=%s' % '|'.join(type)
        url += '&key=%s' % self.app_keys_count(self.key_begin)
        url += '&pagetoken='
        return url

    def save_html(self, item):
        """Append ``item`` to the tail of the Redis list ``self.name``."""
        # Single formatted argument keeps the output identical on Python 2 and 3.
        print("save html into eu_queue: %s" % (item,))
        self.db.rpush(self.name, item)

    def get_html(self):
        """Pop and return the head of the Redis list ``self.name``."""
        return self.db.lpop(self.name)

    def parse_html_save(self, item):
        """Save the results of a popped response, driven by ``item['status']``.

        NOTE(review): this method looks unfinished and its logic is left
        unchanged. ``url`` is read before it is ever assigned (it is a local
        here), and ``self.parse_html`` is not defined on this class, so both
        follow-up branches fail at runtime. The status and page token are
        checked on ``item`` while the data is read from the popped response;
        confirm which object is intended before fixing.
        """
        # Fixed 2 s pause before each pop (presumably rate limiting).
        time.sleep(2)
        josn_response = self.get_html()
        status = item['status']
        if status == 'OK':
            results = josn_response['results']
            # insert to mongo
            self.crud.save_html_insert(results)
            if 'next_page_token' in item:
                pagetoken = '&pagetoken=%s' % josn_response['next_page_token']
                url = re.sub(r'&pagetoken=.*', pagetoken, url)
                self.parse_html(url)
            else:
                pass
        elif status == 'OVER_QUERY_LIMIT':
            self.key = self.app_keys_pop()
            url = re.sub(r'&key=.*&pagetoken', '&key=%s&pagetoken' % self.key,
                         url)
            self.parse_html(url)
        else:
            return
        '''