    def start_bootstrapping(self):
        """
        Main Method - Iterates over all categories, keywords,
                      and misc words, scrapes the urls of all the
                      search results and stores them into MongoDB
        """

        # Arguments Parsing
        args_parser = self.get_arguments_parser()
        self._args = vars(args_parser.parse_args())

        # Log Handler Configuring
        self._logger = Utils.configure_log(self._args)

        # MongoDB Configuring
        if not Utils.configure_mongodb(self, **self._params):
            self._logger.fatal('Error configuring MongoDB')
            sys.exit(errno.ECONNREFUSED)

        # Loads different "seed" terms from the input xml file received
        bs_seed = BootstrappingSeed.Seed(self._args['bootstrapping-terms'])
        bs_seed.initialize_seed_class()
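        # (assumption: the xml seed file carries the category entries and
        #  search words that the loops below consume through the seed's
        #  _top_level_categories attribute and get_words() method)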

        # Checking for proxies
        if self._args['proxies_path']:
            self._proxies = Utils.load_proxies(self._args)
            self._logger.info('Loaded Proxies : %d' % len(self._proxies))
        else:
            self._proxies = None

        # if "Debug Http" is set to true, "verify" must be "false"
        self._verify_certificate = not self._args['debug_https']

        # Request for each top level category
        for top_level_category in bs_seed._top_level_categories:
            self.crawl_category(top_level_category)

        # Simulating searches for specific words
        for word in bs_seed.get_words():
            self.crawl_by_search_word(word)
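
    # A minimal usage sketch (assumption - 'Bootstrapper' is a hypothetical
    # name for the class these methods belong to; the listing does not show
    # the class definition):
    #
    #     if __name__ == '__main__':
    #         Bootstrapper().start_bootstrapping()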
    def scrape_apps(self):
        """
        Main method of the 'Worker' layer of this project.

        This method starts the distributed working phase which will
        consume urls from the seed database and scrape app data out
        of the html pages, storing the result into the
        apps_data collection on MongoDB
        """

        # Arguments Parsing
        args_parser = self.get_arguments_parser()
        self._args = vars(args_parser.parse_args())

        # Log Handler Configuring
        self._logger = Utils.configure_log(self._args)

        # MongoDB Configuring
        if not Utils.configure_mongodb(self, **self._params):
            self._logger.fatal('Error configuring MongoDB')
            sys.exit(errno.ECONNREFUSED)

        # Making sure indexes exist
        self._mongo_wrapper.ensure_index('IsBusy')
        self._mongo_wrapper.ensure_index('_id', self._params['apps_collection'])

        # Proxies Loading
        self._proxies = Utils.load_proxies(self._args)

        # if "Debug Http" is set to true, "verify" must be "false"
        self._verify_certificate = not self._args['debug_https']
        self._is_using_proxies = self._proxies != None

        # Control Variables - Used by the retry logic below
        retries, max_retries = 0, 8

        parser = html_parser()

        # Loop only breaks when there are no more apps to be processed
        while True:

            # Finds an app to be processed and toggles its state to 'Busy'
            seed_record = self._mongo_wrapper.find_and_modify()

            if not seed_record:
                break

            try:
                url = seed_record['_id']

                # Normalize relative urls (e.g. '/store/apps/details?id=x')
                # into absolute Play Store urls
                if 'http://' not in url and 'https://' not in url:
                    url = 'https://play.google.com' + url

                self._logger.info('Processing: %s' % url)

                # Is this app processed already ?
                if self._mongo_wrapper.app_processed(url, self._params['apps_collection']):

                    self._logger.info('Duplicated App : %s. Skipped' % url)
                    self._mongo_wrapper.remove_app_from_queue(seed_record)
                    continue

                # Get Request for the App's Page
                response = requests.get(url,
                                        headers=HTTPUtils.headers,
                                        verify=self._verify_certificate,
                                        proxies=Utils.get_proxy(self))

                # Sanity Checks on Response - a failed response must not be
                # parsed, so toggle the record back and retry or give up
                if not response.text or response.status_code != requests.codes.ok:
                    self._logger.info('Error Opening App Page : %s' % url)

                    retries += 1

                    # Retry logic differs when proxies are being used
                    if self._is_using_proxies:
                        Utils.sleep()

                    self._mongo_wrapper.toggle_app_busy(
                        url, False, self._params['seed_collection'])

                    if retries > max_retries:
                        break
                    continue
                try:
                    # Scraping Data from HTML
                    app = parser.parse_app_data(response.text)

                    # Stamping URL into app model
                    app['Url'] = url
                    app['_id'] = url

                    # Reaching related apps
                    related_apps = parser.parse_related_apps(response.text)

                    if not related_apps:
                        app['RelatedUrls'] = None
                    else:
                        app['RelatedUrls'] = related_apps
                        self._logger.info('Related Apps: %s - %d' % (url, len(related_apps)))

                    # Inserting data into MongoDB
                    self._mongo_wrapper._insert(app, self._params['apps_collection'])

                    # Re-Feeding seed collection with related-app urls
                    if app['RelatedUrls']:
                        for url in app['RelatedUrls']:
                            if not self._mongo_wrapper.app_processed(url, self._params['apps_collection']) and \
                               not self._mongo_wrapper.app_processed(url, self._params['seed_collection']):
                                self._mongo_wrapper.insert_on_queue(url, self._params['seed_collection'])

                except Exception as exception:
                    self._logger.error(exception)

                    # Toggling app state back to false
                    self._mongo_wrapper.toggle_app_busy(
                        url, False, self._params['seed_collection'])

            except Exception as exception:
                self._logger.error(exception)
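
    # A sketch of the atomic 'claim' that find_and_modify presumably performs
    # against the seed collection (assumption - the real wrapper is not shown
    # here; pymongo's pre-3.0 find_and_modify is used for illustration):
    #
    #     def find_and_modify(self):
    #         # Atomically flips one idle record's 'IsBusy' flag so that
    #         # concurrent workers never claim the same url twice
    #         return self._seed_collection.find_and_modify(
    #             query={'IsBusy': False},
    #             update={'$set': {'IsBusy': True}})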
    def crawl_by_search_word(self, word):
        """
        Simulates an app search on the play store, using the
        word argument as the term to be searched.

        Paginates through all the results found and
        stores the unique urls into the MongoDB seed
        collection
        """

        self._logger.info('Scraping links of Word : %s' % word)
        parsed_urls = set()

        # Compiling regex used for parsing page token
        page_token_regex = regex.compile(r"GAEi+.+\:S\:.{11}\\42,")
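        # (assumption: the pattern targets the continuation token embedded in
        #  the response body, which the paging loop below posts back to fetch
        #  the next page of results)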

        post_url = self.assemble_post_url(word)
        post_data = self.assemble_word_search_post_data()

        http_errors = 0
        while http_errors <= self._args['max_errors']:

            try:
                response = requests.post(post_url,
                                         data=post_data,
                                         headers=HTTPUtils.headers,
                                         verify=self._verify_certificate,
                                         proxies=Utils.get_proxy())

                if response.status_code != requests.codes.ok:
                    http_errors += 1
                    #Utils.sleep(http_errors)
                    self._logger.critical('Error [%d] on Response for : %s' %
                                          (response.status_code, word))
                else:
                    for url in self.parse_app_urls(response.text):
                        self._mongo_wrapper.insert_on_queue(url)
                        parsed_urls.add(url)

                    break  # Response worked

            except requests.exceptions.SSLError as error:
                http_errors += 1
                self._logger.error('SSL Error : %s' % error)

        # Paging Through Results
        while http_errors <= self._args['max_errors']:

            page_token = page_token_regex.search(response.text)

            if not page_token:
                self._logger.fatal("Couldn't find page token")
                break

            page_token = self.normalize_page_token(page_token.group())
            post_data = self.assemble_word_search_post_data(page_token)

            try:
                response = requests.post(post_url,
                                         data=post_data,
                                         headers=HTTPUtils.headers,
                                         verify=self._verify_certificate,
                                         proxies=Utils.get_proxy())

                if response.status_code != requests.codes.ok:
                    http_errors += 1
                    #Utils.sleep(http_errors)
                    self._logger.critical('Error [%d] on Response for : %s' %
                                          (response.status_code, word))
                else:
                    for url in self.parse_app_urls(response.text):
                        if url in parsed_urls:
                            return

                        self._mongo_wrapper.insert_on_queue(url)
                        parsed_urls.add(url)
                        #Utils.sleep()

            except requests.exceptions.SSLError as error:
                http_errors += 1
                self._logger.error('SSL Error : %s' % error)
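
    # The commented-out Utils.sleep(http_errors) calls above hint at a backoff
    # between failed requests; a sketch of such a helper (assumption - the
    # real Utils.sleep signature is not shown in this listing):
    #
    #     import random, time
    #
    #     def sleep(errors=0):
    #         # Exponential backoff with jitter, capped at two minutes
    #         time.sleep(min(120, (2 ** errors) + random.random()))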
    def crawl_category(self, category):
        """
        Executes a GET request for the url of the category received.

        Paginates through all the results found and
        stores the unique urls into the MongoDB seed
        collection
        """

        category_url = category[1]
        category_name = category[0]

        self._logger.info('Scraping links of Category : %s' % category_name)
        parsed_urls = set()

        http_errors = 0
        while http_errors <= self._args['max_errors']:

            try:
                response = requests.get(category_url,
                                        headers=HTTPUtils.headers,
                                        verify=self._verify_certificate,
                                        proxies=Utils.get_proxy())

                if response.status_code != requests.codes.ok:
                    http_errors += 1
                    #Utils.sleep(http_errors)
                    self._logger.critical(
                        'Error [%d] on Response for : %s' %
                        (response.status_code, category_name))
                else:
                    for url in self.parse_app_urls(response.text):
                        self._mongo_wrapper.insert_on_queue(url)
                        parsed_urls.add(url)

                    break  # Response worked

            except requests.exceptions.SSLError as error:
                http_errors += 1
                self._logger.error('SSL Error : %s' % error)

        # Paging through results
        base_skip = 60
        current_multiplier = 1

        while http_errors <= self._args['max_errors']:

            post_data = self.assemble_category_post_data(
                current_multiplier, base_skip)
            try:
                response = requests.post(category_url + '?authuser=0',
                                         data=post_data,
                                         headers=HTTPUtils.headers,
                                         verify=self._verify_certificate,
                                         proxies=Utils.get_proxy())

                if response.status_code != requests.codes.ok:
                    http_errors += 1
                    #Utils.sleep(http_errors)
                    self._logger.critical(
                        'Error [%d] on Response for : %s' %
                        (response.status_code, category_name))
                else:
                    for url in self.parse_app_urls(response.text):
                        if url in parsed_urls:
                            return

                        parsed_urls.add(url)
                        self._mongo_wrapper.insert_on_queue(url)
                        #Utils.sleep()

            except requests.exceptions.SSLError as error:
                http_errors += 1
                self._logger.error('SSL Error : %s' % error)

            current_multiplier += 1
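
    # The paging above walks results in windows of 'base_skip' items; a sketch
    # of the post data assembly (assumption - the actual field names the Play
    # Store endpoint expects are not shown in this listing):
    #
    #     def assemble_category_post_data(self, multiplier, base_skip):
    #         return {'start': multiplier * base_skip, 'num': base_skip}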