def search(self):
    """Search Google for PPTX files on the target domain and collect their text.

    Requests Google result pages (``start`` offsets advancing by 10) while
    ``self.Counter`` is within both ``self.Limit`` and 100, appending the
    link targets found on each page to ``self.urlList``.  Each collected
    link is then passed to ``dl.download_file2``; when the downloaded file
    is reported as PowerPoint by ``helpers.filetype``, its converted text
    is appended to ``self.Text``.  The downloader is asked to delete each
    file afterwards.

    Errors are reported on the console and the search carries on; nothing
    is returned.
    """
    convert = Converter.Converter(self.verbose)
    dl = Download.Download(self.verbose)
    while self.Counter <= self.Limit and self.Counter <= 100:
        time.sleep(1)
        if self.verbose:
            p = ' [*] Google PPTX Search on page: ' + str(self.Counter)
            print(helpers.color(p, firewall=True))
        # Reset per page: a failure must never fall through to the URL or
        # HTML left over from the previous page (or to an unbound name on
        # the very first page).
        url = None
        RawHtml = None
        try:
            url = "https://www.google.com/search?q=" + \
                self.Domain + "+filetype:pptx&start=" + str(self.Counter)
        except Exception as e:
            error = " [!] Major issue with Google Search:" + str(e)
            print(helpers.color(error, warning=True))
        if url is not None:
            try:
                RawHtml = dl.requesturl(url, useragent=self.UserAgent)
            except Exception as e:
                error = " [!] Fail during Request to Google (Check Connection):" + \
                    str(e)
                print(helpers.color(error, warning=True))
        if RawHtml is not None:
            # check for captcha
            try:
                dl.GoogleCaptchaDetection(RawHtml)
            except Exception as e:
                print(e)
            soup = BeautifulSoup(RawHtml)
            # Walk every anchor and pull the real target out of its "q"
            # query parameter, see:
            # https://stackoverflow.com/questions/21934004/not-getting-proper-links-
            # from-google-search-results-using-mechanize-and-beautifu/22155412#22155412?
            # newreg=01f0ed80771f4dfaa269b15268b3f9a9
            for a in soup.findAll('a'):
                try:
                    target = urlparse.parse_qs(
                        urlparse.urlparse(a['href']).query)['q'][0]
                    # 'http' also covers 'https'.
                    if target.startswith(('http', 'www')):
                        if "webcache.googleusercontent.com" not in target:
                            self.urlList.append(target)
                    # for some reason PPTX seems to be cached data: keep
                    # whatever follows the second ':' of the target.
                    # NOTE(review): a plain URL with an explicit port also
                    # has a third ':'-separated part - confirm intended.
                    parts = target.split(':', 2)
                    if "webcache.googleusercontent.com" not in parts[2]:
                        self.urlList.append(parts[2])
                except Exception:
                    # Best effort: most anchors have no usable "q" value or
                    # no third part.  Ctrl-C is no longer swallowed here.
                    pass
        self.Counter += 10
    # now download the required files
    try:
        for url in self.urlList:
            if self.verbose:
                p = ' [*] Google PPTX search downloading: ' + str(url)
                print(helpers.color(p, firewall=True))
            # Reset so cleanup never acts on the previous iteration's file.
            FileName = None
            try:
                filetype = ".pptx"
                FileName, FileDownload = dl.download_file2(url, filetype)
                if FileDownload:
                    if self.verbose:
                        p = ' [*] Google PPTX file was downloaded: ' + \
                            str(url)
                        print(helpers.color(p, firewall=True))
                    ft = helpers.filetype(FileName).lower()
                    if 'powerpoint' in ft:
                        self.Text += convert.convert_pptx_to_txt(FileName)
                    else:
                        self.logger.warning(
                            'Downloaded file is not a PPTX: ' + ft)
            except Exception as e:
                print(helpers.color(" [!] Issue with opening PPTX Files\n", firewall=True))
            if FileName is not None:
                try:
                    dl.delete_file(FileName)
                except Exception as e:
                    print(e)
    except Exception:
        print(helpers.color(" [*] No PPTX to download from Google!\n", firewall=True))
def search(self):
    """Search Exalead for PPTX files on the target domain and collect their text.

    While ``self.Counter`` is within both ``self.Limit`` and 10, requests an
    Exalead result page, appends the raw HTML to ``self.Text`` and stores
    the links found under ``h4.media-heading`` elements in ``self.urlList``.
    Each stored link is then passed to ``dl.download_file``; when the
    downloaded file is reported as PowerPoint by ``helpers.filetype``, its
    converted text is appended to ``self.Text``.  The downloader is asked
    to delete each file afterwards.

    Errors are logged and reported on the console and the search carries
    on; nothing is returned.
    """
    dl = Download.Download(self.verbose)
    convert = Converter.Converter(verbose=self.verbose)
    # NOTE(review): the counter advances by 30 but is capped at 10, so at
    # most one page is requested per call - confirm this is intended.
    while self.Counter <= self.Limit and self.Counter <= 10:
        helpers.modsleep(1)
        if self.verbose:
            p = ' [*] Exalead PPTX Search on page: ' + str(self.Counter)
            self.logger.info('ExaleadPPTXSearch on page: ' + str(self.Counter))
            print(helpers.color(p, firewall=True))
        # Reset so a failed build cannot fall through to a stale/unbound URL.
        url = None
        try:
            url = 'http://www.exalead.com/search/web/results/?q="%40' + self.Domain + \
                '"+filetype:pptx&elements_per_page=' + \
                str(self.Quanity) + '&start_index=' + str(self.Counter)
        except Exception as e:
            self.logger.error('ExaleadPPTXSearch could not build URL')
            error = " [!] Major issue with Exalead PPTX Search: " + str(e)
            print(helpers.color(error, warning=True))
        if url is not None:
            try:
                RawHtml = dl.requesturl(url, useragent=self.UserAgent)
                # sometimes url is broken but exalead search results contain
                # e-mail
                self.Text += RawHtml
                soup = BeautifulSoup(RawHtml, "lxml")
                # Skip headings without a usable link instead of letting one
                # malformed result discard every link on the page.
                links = []
                for heading in soup.findAll('h4', class_='media-heading'):
                    anchor = heading.a
                    href = anchor.get("href") if anchor is not None else None
                    if href:
                        links.append(href)
                self.urlList = links
            except Exception as e:
                self.logger.error(
                    'ExaleadPPTXSearch could not request / parse HTML')
                error = " [!] Fail during parsing result: " + str(e)
                print(helpers.color(error, warning=True))
        self.Counter += 30
    # now download the required files
    try:
        for url in self.urlList:
            if self.verbose:
                p = ' [*] Exalead PPTX search downloading: ' + str(url)
                self.logger.info('ExaleadPPTXSearch downloading: ' + str(url))
                print(helpers.color(p, firewall=True))
            # Reset so cleanup never acts on the previous iteration's file.
            FileName = None
            try:
                filetype = ".pptx"
                dl = Download.Download(self.verbose)
                FileName, FileDownload = dl.download_file(url, filetype)
                if FileDownload:
                    if self.verbose:
                        p = ' [*] Exalead PPTX file was downloaded: ' + \
                            str(url)
                        # Log under this module's own name and record the
                        # URL, matching the "downloading" entry above.
                        self.logger.info('ExaleadPPTXSearch downloaded: ' + str(url))
                        print(helpers.color(p, firewall=True))
                    ft = helpers.filetype(FileName).lower()
                    if 'powerpoint' in ft:
                        self.Text += convert.convert_zip_to_text(FileName)
                    else:
                        self.logger.warning(
                            'Downloaded file is not a PPTX: ' + ft)
            except Exception as e:
                error = " [!] Issue with opening PPTX Files:%s" % (str(e))
                print(helpers.color(error, warning=True))
            if FileName is not None:
                try:
                    dl.delete_file(FileName)
                except Exception as e:
                    print(e)
    except Exception as e:
        self.logger.error("ExaleadPPTXSearch no doc's to download")
        print(helpers.color(" [*] No PPTX's to download from Exalead!\n", firewall=True))
    if self.verbose:
        p = ' [*] Searching PPTX from Exalead Complete'
        print(helpers.color(p, status=True))
def search(self):
    """Search Exalead for PPTX files on the target domain and collect their text.

    While ``self.Counter`` is within both ``self.Limit`` and 10, requests an
    Exalead result page, appends the raw HTML to ``self.Text`` and stores
    the links found under ``h4.media-heading`` elements in ``self.urlList``.
    Each stored link is then passed to ``dl.download_file``; when the
    downloaded file is reported as PowerPoint by ``helpers.filetype``, its
    converted text is appended to ``self.Text``.  The downloader is asked
    to delete each file afterwards.

    Errors are logged and reported on the console and the search carries
    on; nothing is returned.
    """
    dl = Download.Download(self.verbose)
    convert = Converter.Converter(verbose=self.verbose)
    # NOTE(review): the counter advances by 30 but is capped at 10, so at
    # most one page is requested per call - confirm this is intended.
    while self.Counter <= self.Limit and self.Counter <= 10:
        helpers.modsleep(1)
        if self.verbose:
            p = ' [*] Exalead PPTX Search on page: ' + str(self.Counter)
            self.logger.info('ExaleadPPTXSearch on page: ' + str(self.Counter))
            print(helpers.color(p, firewall=True))
        # Reset so a failed build cannot fall through to a stale/unbound URL.
        url = None
        try:
            url = 'http://www.exalead.com/search/web/results/?q="%40' + self.Domain + \
                '"+filetype:pptx&elements_per_page=' + \
                str(self.Quanity) + '&start_index=' + str(self.Counter)
        except Exception as e:
            self.logger.error('ExaleadPPTXSearch could not build URL')
            error = " [!] Major issue with Exalead PPTX Search: " + str(e)
            print(helpers.color(error, warning=True))
        if url is not None:
            try:
                RawHtml = dl.requesturl(url, useragent=self.UserAgent)
                # sometimes url is broken but exalead search results contain
                # e-mail
                self.Text += RawHtml
                soup = BeautifulSoup(RawHtml, "lxml")
                # Skip headings without a usable link instead of letting one
                # malformed result discard every link on the page.
                links = []
                for heading in soup.findAll('h4', class_='media-heading'):
                    anchor = heading.a
                    href = anchor.get("href") if anchor is not None else None
                    if href:
                        links.append(href)
                self.urlList = links
            except Exception as e:
                self.logger.error('ExaleadPPTXSearch could not request / parse HTML')
                error = " [!] Fail during parsing result: " + str(e)
                print(helpers.color(error, warning=True))
        self.Counter += 30
    # now download the required files
    try:
        for url in self.urlList:
            if self.verbose:
                p = ' [*] Exalead PPTX search downloading: ' + str(url)
                self.logger.info('ExaleadPPTXSearch downloading: ' + str(url))
                print(helpers.color(p, firewall=True))
            # Reset so cleanup never acts on the previous iteration's file.
            FileName = None
            try:
                filetype = ".pptx"
                dl = Download.Download(self.verbose)
                FileName, FileDownload = dl.download_file(url, filetype)
                if FileDownload:
                    if self.verbose:
                        p = ' [*] Exalead PPTX file was downloaded: ' + \
                            str(url)
                        # Log under this module's own name and record the
                        # URL, matching the "downloading" entry above.
                        self.logger.info('ExaleadPPTXSearch downloaded: ' + str(url))
                        print(helpers.color(p, firewall=True))
                    ft = helpers.filetype(FileName).lower()
                    if 'powerpoint' in ft:
                        self.Text += convert.convert_zip_to_text(FileName)
                    else:
                        self.logger.warning('Downloaded file is not a PPTX: ' + ft)
            except Exception as e:
                error = " [!] Issue with opening PPTX Files:%s" % (str(e))
                print(helpers.color(error, warning=True))
            if FileName is not None:
                try:
                    dl.delete_file(FileName)
                except Exception as e:
                    print(e)
    except Exception as e:
        self.logger.error("ExaleadPPTXSearch no doc's to download")
        print(helpers.color(" [*] No PPTX's to download from Exalead!\n", firewall=True))
    if self.verbose:
        p = ' [*] Searching PPTX from Exalead Complete'
        print(helpers.color(p, status=True))
def search(self):
    """Search Google for PPTX files on the target domain and collect their text.

    Requests Google result pages (``start`` offsets advancing by 10) while
    ``self.Counter`` is within both ``self.Limit`` and 100, appending the
    link targets found on each page to ``self.urlList``.  Each collected
    link is then passed to ``dl.download_file2``; when the downloaded file
    is reported as PowerPoint by ``helpers.filetype``, its converted text
    is appended to ``self.Text``.  Files that were downloaded are handed
    back to the downloader for deletion.

    Errors are reported on the console or the logger and the search
    carries on; nothing is returned.
    """
    convert = Converter.Converter(self.verbose)
    dl = Download.Download(self.verbose)
    while self.Counter <= self.Limit and self.Counter <= 100:
        time.sleep(1)
        if self.verbose:
            p = ' [*] Google PPTX Search on page: ' + str(self.Counter)
            print(helpers.color(p, firewall=True))
        # Reset per page: a failure must never fall through to the URL or
        # HTML left over from the previous page (or to an unbound name on
        # the very first page).
        url = None
        RawHtml = None
        try:
            url = "https://www.google.com/search?q=" + \
                self.Domain + "+filetype:pptx&start=" + str(self.Counter)
        except Exception as e:
            error = " [!] Major issue with Google Search:" + str(e)
            print(helpers.color(error, warning=True))
        if url is not None:
            try:
                RawHtml = dl.requesturl(url, useragent=self.UserAgent)
            except Exception as e:
                error = " [!] Fail during Request to Google (Check Connection):" + \
                    str(e)
                print(helpers.color(error, warning=True))
        if RawHtml is not None:
            # check for captcha
            try:
                dl.GoogleCaptchaDetection(RawHtml)
            except Exception as e:
                print(e)
            soup = BeautifulSoup(RawHtml)
            # Walk every anchor and pull the real target out of its "q"
            # query parameter, see:
            # https://stackoverflow.com/questions/21934004/not-getting-proper-links-
            # from-google-search-results-using-mechanize-and-beautifu/22155412#22155412?
            # newreg=01f0ed80771f4dfaa269b15268b3f9a9
            for a in soup.findAll('a'):
                try:
                    target = urlparse.parse_qs(
                        urlparse.urlparse(a['href']).query)['q'][0]
                    # 'http' also covers 'https'.
                    if target.startswith(('http', 'www')):
                        if "webcache.googleusercontent.com" not in target:
                            self.urlList.append(target)
                    # for some reason PPTX seems to be cached data: keep
                    # whatever follows the second ':' of the target.
                    # NOTE(review): a plain URL with an explicit port also
                    # has a third ':'-separated part - confirm intended.
                    parts = target.split(':', 2)
                    if "webcache.googleusercontent.com" not in parts[2]:
                        self.urlList.append(parts[2])
                except Exception:
                    # Best effort: most anchors have no usable "q" value or
                    # no third part.  Ctrl-C is no longer swallowed here.
                    pass
        self.Counter += 10
    # now download the required files
    try:
        for url in self.urlList:
            if self.verbose:
                p = ' [*] Google PPTX search downloading: ' + str(url)
                print(helpers.color(p, firewall=True))
            # Reset so cleanup never acts on the previous iteration's
            # download state when this download raises.
            FileName = None
            FileDownload = False
            try:
                filetype = ".pptx"
                FileName, FileDownload = dl.download_file2(url, filetype)
                if FileDownload:
                    if self.verbose:
                        p = ' [*] Google PPTX file was downloaded: ' + \
                            str(url)
                        print(helpers.color(p, firewall=True))
                    ft = helpers.filetype(FileName).lower()
                    if 'powerpoint' in ft:
                        self.Text += convert.convert_zip_to_text(FileName)
                    else:
                        self.logger.warning('Downloaded file is not a PPTX: ' + ft)
            except Exception as e:
                print(helpers.color(" [!] Issue with opening PPTX Files\n", firewall=True))
            try:
                if FileDownload:
                    dl.delete_file(FileName)
            except Exception as e:
                self.logger.warning('Issue deleting file: ' + str(e))
    except Exception:
        # This module searches for PPTX files; the message used to say CSV.
        print(helpers.color(" [*] No PPTX to download from Google!\n", firewall=True))