def __check_programming_language(self, url):
     """
     Description:
     ============
     This method will try its level best to get the name of the programming
     language used to build the website.
     Notes:
     ======
     This method will heavily used URL class from url package
     :return:
     """
     self.__thread_semaphore.acquire()
     print("[+] ANALYSING PROGRAMMING LANGUAGE")
     # These are the popular programming languages used for designing websites
     language_names = {
         ".php": "PHP",
         ".jsp": "JSP",
         ".asp": "ASP",
         ".aspx": "ASPX",
         ".py": "PYTHON",
         ".pl": "PERL"
     }
     user_agent = UserAgent.get_user_agent()
     r = URL().get_request(url=url, user_agent=user_agent)
     if r is not None:
         soup = BeautifulSoup(r.content, "html.parser")
         for i in soup.find_all("a"):
             try:
                 partial_url = i.get("href")
                 if "http" not in partial_url:
                     new_url = URL.join_urls(url, partial_url)
                 else:
                     new_url = partial_url if URL.is_same_domain(
                         url, new_url) else ""
                 file_name = URL.get_file_name(new_url)
                 for i in language_names:
                     if i in file_name:
                         self.__programming_language_used = language_names[
                             i]
                         # Now we will update the programming language used into the database
                         InfoGatheringPhaseOneDatabase.update_programming_language(
                             self.__database_semaphore, self.__connection,
                             self.__project_id,
                             self.__programming_language_used)
                         break
                     if i in file_name:
                         break
             except Exception:
                 pass
     self.__thread_semaphore.release()
Exemple #2
0
 def crawl(self, url):
     """
     Description:
     ------------
     This will crawl the urls completely
     :param url: The url to be crawled
     :return: None
     """
     start_time = time.time()
     r = URL().get_request(url=url, user_agent=UserAgent.get_user_agent())
     end_time = time.time()
     total_time = end_time - start_time
     self.__bob_object.predict(total_time)
     if r is not None:
         soup = BeautifulSoup(r.content, "html.parser")
         # At this stage we have got the beautiful soup objects
         #First find all the href links
         for i in soup.find_all("a"):
             try:
                 partial_url = i.get("href")
                 url_to_be_scanned = None  # we will scan this urls
                 # Check if the partial url is actually a partial url
                 if "http" in partial_url:
                     if URL.is_same_domain(self.__base_url, partial_url):
                         if partial_url not in self.__crawled_urls:
                             self.__urls.put(partial_url)
                             self.__crawled_urls.append(partial_url)
                             url_to_be_scanned = partial_url
                 else:
                     full_url = URL.join_urls(self.__base_url, partial_url)
                     if full_url not in self.__crawled_urls:
                         self.__urls.put(full_url)
                         self.__crawled_urls.append(full_url)
                         url_to_be_scanned = full_url
                 # run a simple scan in the url
                 if url_to_be_scanned is not None:
                     print("[i] CURRENTLY SCANNING [GET]: ",
                           url_to_be_scanned)
                     # Make the scanning as a new process
                     SimpleScan(
                         project_id=self.__project_id,
                         thread_semaphore=self.__thread_semaphore,
                         database_semaphore=self.__database_semaphore,
                         url=url_to_be_scanned,
                         connection=self.__connection,
                         poc_object=self.__poc_object)
             except Exception as e:
                 print("[-] EXCEPTION OCCURED ", e)
     while not self.__urls.empty():
         self.crawl(self.__urls.get())