Example #1
    def paginate_page(self, query):
        #------------------------------------------------------------------------
        # We are going to fetch the first pages
        # of Google's results, up to self._stop_page
        #------------------------------------------------------------------------
        start_url = f"https://www.google.com/search?q={query}&oq=&aqs=chrome.0.35i39l8.13736357j0j1&sourceid=chrome&ie=UTF-8"
        while self._page <= self._stop_page:
            self._driver.get(start_url + f"&start={(self._page * 10)}")
            try:
                self.extract_page_content()
            except TimeoutException:
                # TimeoutException subclasses WebDriverException, so it must be caught first
                println("Timeout error, check your internet connection", "warn")
            except WebDriverException:
                println(
                    "Issue with the selenium renderer, most likely while taking a screenshot",
                    "warn")

            # Save the extracted data as JSON after every page is processed
            with open('extracted/' + str(self._page) + '.json',
                      'w') as outfile:
                json.dump(self._data_extract, outfile)

            self._page += 1
            self._data_extract = []
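
A minimal standalone sketch of the same pagination idea, assuming a plain webdriver.Chrome() session with chromedriver on the PATH (the helper name google_result_pages is mine, not from the project):

from urllib.parse import quote_plus
from selenium import webdriver

def google_result_pages(query, stop_page):
    """Yield the raw HTML of each Google result page up to stop_page."""
    driver = webdriver.Chrome()
    try:
        for page in range(stop_page + 1):
            # Google paginates with the "start" parameter:
            # page 0 -> start=0, page 1 -> start=10, and so on
            driver.get("https://www.google.com/search"
                       f"?q={quote_plus(query)}&start={page * 10}")
            yield driver.page_source
    finally:
        driver.quit()
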
Example #2
    def screengrab(self, file_name: str):
        try:
            # Dismiss any modal that may have appeared
            ActionChains(self._driver).send_keys(Keys.ESCAPE).perform()

            self._driver.find_element_by_tag_name('body').screenshot(file_name)

        except NoSuchElementException:
            println(
                f"We experienced an issue while trying to screenshot this website ({self._site_content['url']})",
                "warn")
Example #3
    def extract_info_from_link(self):
        #------------------------------------------------------------------------
        # We will visit each website and extract
        # every email address and phone number
        # found on it
        #------------------------------------------------------------------------

        # Load up a new tab to handle this
        self._driver.execute_script("window.open('');")
        self._driver.switch_to.window(self._driver.window_handles[-1])

        self._driver.get(self._site_content['url'])
        time.sleep(5)  # crude wait for the page to render before reading it

        html_source = self._driver.find_element_by_tag_name(
            'body').get_attribute('innerHTML')
        extracted_numbers = ""
        extracted_emails = ""

        # Grab the meta description if the page exposes one
        try:
            self._site_content['site_description'] = self._driver.find_element_by_xpath("//meta[@name='description']")\
              .get_attribute("content")
        except NoSuchElementException:
            println(
                f"Oops, we couldn't find a meta description for this website ({self._site_content['url']})",
                "warn")

        # str.replace() does not treat its argument as a regex, so use re.sub
        # (requires "import re") to turn punctuation and spaces into underscores
        screen_shot_name = 'static/' + re.sub(
            r"[,.!*\- ]", "_", self._site_content["title"]) + '.png'

        # Now we use regex to match all occurrences of emails or phone numbers in the page source
        found_numbers = self.scan_for_numbers(html_source)
        found_emails = self.scan_for_emails(html_source)
        verified_numbers = self.extract_mobile_number(found_numbers)

        self._site_content['contact_number'] = verified_numbers
        self._site_content['contact_email'] = found_emails

        if verified_numbers or found_emails:
            # Increase the size of the page for our screenshot
            self._driver.set_window_size(1920, 8000)
            self.screengrab(screen_shot_name)
            self._site_content['screen_shot'] = screen_shot_name

            # We are done with processing now lets add to our list
            self._data_extract.append(self._site_content)

        # Close the tab we opened, then switch back to the results window
        self._driver.close()
        self._driver.switch_to.window(self._driver.window_handles[-1])
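
The scan_for_numbers and scan_for_emails helpers called above are not shown in these examples; here is a plausible sketch with hand-rolled patterns (the regexes are assumptions, not the project's own):

import re

# Deliberately loose patterns; tighten them for production use
EMAIL_RE = re.compile(r"[\w.+-]+@[\w-]+\.[\w.-]+")
PHONE_RE = re.compile(r"\+?\d[\d\s().-]{7,}\d")

def scan_for_emails(html_source):
    """Return every email-looking string in the page source, deduplicated."""
    return sorted(set(EMAIL_RE.findall(html_source)))

def scan_for_numbers(html_source):
    """Return every phone-number-looking string in the page source, deduplicated."""
    return sorted(set(PHONE_RE.findall(html_source)))
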
Example #4
def main():
    executor_url = ""
    session_id = ""
    selected_browser = args.browser
    browser_driver_path = args.driver
    query = args.query
    start_page = args.start - 1
    stop_page = args.stop - 1

    if start_page < 0:
        start_page = 0  # If the user passes in 0, clamp to the first page
    elif (stop_page - start_page) > 15:
        println("You cannot search more than 15 pages at a time")
        return

    # Determine which browser to use for this tool
    driver = determine_browser(selected_browser, browser_driver_path)
    if isinstance(driver, str):
        println(driver)
    else:
        executor_url = driver.command_executor._url
        session_id = driver.session_id

        # Stretch the window height so full-page screenshots capture more content
        driver.set_window_size(1920, 8000)

        println(f"Google's Query: {query}", "normal")
        extractor = Extractor(driver, query, start_page, stop_page)
        println("Congratulations, scraping complete", "normal")
        driver.close()
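
determine_browser itself is not shown; judging by the isinstance(driver, str) check, it returns either a live webdriver or an error string. A sketch under that assumption, using the Selenium 3 executable_path argument:

from selenium import webdriver

def determine_browser(name, driver_path):
    """Return a webdriver for the requested browser, or an error string."""
    if name == "chrome":
        return webdriver.Chrome(executable_path=driver_path)
    if name == "firefox":
        return webdriver.Firefox(executable_path=driver_path)
    return f"Unsupported browser: {name}"
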
Example #5
    def paginate_page(self, query):
        #------------------------------------------------------------------------
        # We are going to fetch the first pages
        # of Google's results, up to self._stop_page
        #------------------------------------------------------------------------
        start_url = f"https://www.google.com/search?q={query}&sourceid=chrome&ie=UTF-8"
        while self._page <= self._stop_page:
            self._driver.get(start_url + f"&start={(self._page * 10)}")
            try:
                self.extract_page_content()
            except TimeoutException as e:
                # TimeoutException subclasses WebDriverException, so it must be caught first
                println(f"Timeout error: {str(e)}", "warn")
            except WebDriverException as e:
                println(f"Selenium error: {str(e)}", "warn")

            self._page += 1
            self._data_extract = []

        println("Congratulations, scraping complete", "normal")
Example #6
    def extract_page_content(self):
        #------------------------------------------------------------------------
        # We are going to get all major links on a page and
        # check that they do not contain the words
        # "english", "translate" or "translation".
        # Any item that passes this filter will be considered for scraping
        #------------------------------------------------------------------------
        dictionary_words = [
            "english", "translate", "translation", "dictionary", "Thesaurus",
            "translations"
        ]
        response = self._driver.find_elements_by_css_selector("div.g")

        # Now we look through all search results
        for result in response:
            self._site_content = {
                'title': '',
                'url': '',
                'description': '',
                'site_description': '',
                'screen_shot': '',
                'contact_email': '',
                'contact_number': ''
            }

            google_result = result.find_element_by_css_selector("div.rc")

            self._site_content['title'] = google_result.find_element_by_css_selector("div.r")\
              .find_element_by_css_selector("h3.LC20lb.DKV0Md").text

            self._site_content['description'] = google_result.find_element_by_css_selector("div.s")\
              .find_element_by_css_selector("span.st").text

            self._site_content['url'] = google_result.find_element_by_css_selector("div.r")\
              .find_element_by_tag_name("a").get_attribute("href")

            if (not self.words_in_string(dictionary_words, self._site_content['title']) and
                    not self.words_in_string(dictionary_words, self._site_content['description'])):
                #------------------------------------------------------------------------
                # This website is not a dictionary, now we can start
                # scanning through to extract just
                # The data we need
                #------------------------------------------------------------------------
                if "youtube" in self._site_content['url']:
                    continue
                elif "facebook" in self._site_content['url']:
                    #------------------------------------------------------------------------
                    # First we split by "/"
                    # We check whether the last segment is empty, in case the URL ended with "/"
                    # If it's empty, we use the second to last
                    # If it's not empty, we check whether it contains "?" (a query string)
                    # If it does, we still use the second to last
                    #------------------------------------------------------------------------
                    split_page_url_list = self._site_content['url'].split("/")

                    if split_page_url_list[-1] == "":
                        page_name = split_page_url_list[-2]
                    elif "?" in split_page_url_list[-1]:
                        page_name = split_page_url_list[-2]
                    else:
                        page_name = split_page_url_list[-1]

                    self._site_content['url'] = f"https://web.facebook.com/pg/{page_name}/about/"

                try:
                    self.extract_info_from_link()
                except NoSuchElementException:
                    # We have seen cases where the body element was empty, meaning the website didn't exist.
                    # Since a new window was launched before that error,
                    # we have to close it and switch back to the search results window
                    self._driver.close()
                    self._driver.switch_to.window(self._driver.window_handles[-1])
                    println(
                        f"This website ({self._site_content['url']}) has an issue and could not be parsed",
                        "warn")
Example #7
def main():
  selected_browser = args.browser
  browser_driver_path = args.driver
  query = args.query
  file_name = args.file
  start_page = args.start - 1
  stop_page = args.stop - 1

  if start_page < 0:
    start_page = 0 # If the user passes in 0, clamp to the first page
  elif (stop_page - start_page) > 15:
    println("You cannot search more than 15 pages at a time")
    return

  # Determine which browser to use for this tool
  driver = determine_browser(selected_browser, browser_driver_path)
  if isinstance(driver, str):
    println(driver)
  else:
    executor_url = driver.command_executor._url
    session_id = driver.session_id

    # Stretch the window height so full-page screenshots capture more content
    driver.set_window_size(1920, 8000)

    println(f"Google's Query: {query}", "normal")
    extractor = Extractor(driver, query, start_page, stop_page, file_name)
    driver.close()

try:
  main()
except Exception as e:
  println(f"Oops, something went wrong: {str(e)}", "fail")