예제 #1
0
 def search(self):
     """Harvest text from DOCX files that Google lists for ``self.Domain``.

     Phase one walks Google result pages in steps of 10 while
     ``self.Counter`` stays within ``self.Limit`` (and at most 100),
     appending every outbound result link to ``self.urlList``. Phase two
     downloads each collected link, converts it with
     ``convert_docx_to_txt`` and appends the result to ``self.Text``;
     the downloaded file is removed afterwards.

     Failures are reported on the console and the search carries on with
     the next page or the next file.
     """
     dl = Download.Download(self.verbose)
     convert = Converter.Converter(verbose=self.verbose)
     while self.Counter <= self.Limit and self.Counter <= 100:
         time.sleep(1)
         if self.verbose:
             p = ' [*] Google DOCX Search on page: ' + str(self.Counter)
             print(helpers.color(p, firewall=True))
         try:
             urly = "https://www.google.com/search?q=site:" + \
                 self.Domain + "+filetype:docx&start=" + str(self.Counter)
         except Exception as e:
             error = "[!] Major issue with Google Search:" + str(e)
             print(helpers.color(error, warning=True))
             # No usable URL for this page; advance so the loop terminates.
             self.Counter += 10
             continue
         try:
             r = requests.get(urly)
         except Exception as e:
             error = " [!] Fail during Request to Google (Check Connection):" + \
                 str(e)
             print(helpers.color(error, warning=True))
             # Without a response there is nothing to parse on this page.
             self.Counter += 10
             continue
         RawHtml = r.content
         soup = BeautifulSoup(RawHtml)
         # The target of a result link is read from the ``q`` query
         # parameter of the anchor's href, see:
         # https://stackoverflow.com/questions/21934004/not-getting-proper-links-
         # from-google-search-results-using-mechanize-and-beautifu/22155412#22155412?
         # newreg=01f0ed80771f4dfaa269b15268b3f9a9
         for a in soup.findAll('a'):
             try:
                 link = urlparse.parse_qs(
                     urlparse.urlparse(a['href']).query)['q'][0]
                 if link.startswith(('http', 'www')):
                     if "webcache.googleusercontent.com" not in link:
                         self.urlList.append(link)
             except Exception:
                 # Best effort: anchors without an href / ``q`` value are
                 # skipped. KeyboardInterrupt is not swallowed here.
                 continue
         self.Counter += 10
     # now download the required files
     try:
         for url in self.urlList:
             if self.verbose:
                 p = ' [*] Google DOCX search downloading: ' + str(url)
                 print(helpers.color(p, firewall=True))
             # Reset per URL so cleanup never reuses the previous file name.
             FileName = None
             try:
                 filetype = ".docx"
                 FileName, FileDownload = dl.download_file(url, filetype)
                 if FileDownload:
                     if self.verbose:
                         p = ' [*] Google DOCX file was downloaded: ' + \
                             str(url)
                         print(helpers.color(p, firewall=True))
                     self.Text += convert.convert_docx_to_txt(FileName)
             except Exception:
                 print(helpers.color(" [!] Issue with Converting Docx Files\n", firewall=True))
             # Only clean up when the download step produced a file name.
             if FileName is not None:
                 try:
                     dl.delete_file(FileName)
                 except Exception as e:
                     print(e)
     except Exception:
         print(helpers.color(" [*] No DOCX's to download from Google!\n", firewall=True))
예제 #2
0
def test_converter():
    """Run each Converter method on its sample document and check the text.

    The sample files are looked up under ``<parent of cwd>/SimplyEmail/tests/``.
    """
    fixture_dir = os.path.dirname(os.path.realpath('.')) + '/SimplyEmail/tests/'
    converter = Converter.Converter(verbose=True)
    print(fixture_dir)
    # (converter method name, sample file, phrases expected in the output)
    cases = (
        ('convert_docx_to_txt', 'Test-DOCX.docx',
         ('How to Design and Test',)),
        ('convert_doc_to_txt', 'Test-DOC.doc',
         ('How to Design and Test',)),
        ('convert_pdf_to_txt', 'Test-PDF.pdf',
         ('How to Design and Test',)),
        ('convert_zip_to_text', 'Test-PPTX.pptx',
         ('Test SLIDE', 'Test SLIDE 2', 'Test SLIDE 3')),
    )
    for method_name, sample, phrases in cases:
        # Resolve the method lazily so the calls happen in the listed order.
        text = getattr(converter, method_name)(fixture_dir + sample)
        assert text
        for phrase in phrases:
            assert phrase in text
예제 #3
0
 def search(self):
     """Harvest text from PPTX files that Google lists for ``self.Domain``.

     Phase one walks Google result pages in steps of 10 while
     ``self.Counter`` stays within ``self.Limit`` (and at most 100) and
     appends result links to ``self.urlList``. Phase two downloads each
     link with ``download_file2``; files whose detected type contains
     ``powerpoint`` are converted and appended to ``self.Text``, others
     are logged as warnings. Downloaded files are removed afterwards.

     Failures are reported on the console and the search carries on with
     the next page or the next file.
     """
     convert = Converter.Converter(self.verbose)
     dl = Download.Download(self.verbose)
     while self.Counter <= self.Limit and self.Counter <= 100:
         time.sleep(1)
         if self.verbose:
             p = ' [*] Google PPTX Search on page: ' + str(self.Counter)
             print(helpers.color(p, firewall=True))
         try:
             # NOTE(review): unlike the sibling Google searches this query
             # has no ``site:`` operator - confirm that this is intended.
             url = "https://www.google.com/search?q=" + \
                 self.Domain + "+filetype:pptx&start=" + str(self.Counter)
         except Exception as e:
             error = " [!] Major issue with Google Search:" + str(e)
             print(helpers.color(error, warning=True))
             # No usable URL for this page; advance so the loop terminates.
             self.Counter += 10
             continue
         try:
             RawHtml = dl.requesturl(url, useragent=self.UserAgent)
         except Exception as e:
             error = " [!] Fail during Request to Google (Check Connection):" + \
                 str(e)
             print(helpers.color(error, warning=True))
             # Without a response there is nothing to parse on this page.
             self.Counter += 10
             continue
         # check for captcha
         try:
             dl.GoogleCaptchaDetection(RawHtml)
         except Exception as e:
             print(e)
         soup = BeautifulSoup(RawHtml)
         # The target of a result link is read from the ``q`` query
         # parameter of the anchor's href, see:
         # https://stackoverflow.com/questions/21934004/not-getting-proper-links-
         # from-google-search-results-using-mechanize-and-beautifu/22155412#22155412?
         # newreg=01f0ed80771f4dfaa269b15268b3f9a9
         for a in soup.findAll('a'):
             try:
                 link = urlparse.parse_qs(urlparse.urlparse(
                     a['href']).query)['q'][0]
                 # 'http' also covers 'https', so two prefixes suffice.
                 if link.startswith(('http', 'www')):
                     if "webcache.googleusercontent.com" not in link:
                         self.urlList.append(link)
                 # for some reason PPTX seems to be cached data: keep
                 # whatever follows the second ':' of the value, if any.
                 parts = link.split(':', 2)
                 if len(parts) > 2:
                     if "webcache.googleusercontent.com" not in parts[2]:
                         self.urlList.append(parts[2])
             except Exception:
                 # Best effort: anchors without an href / ``q`` value are
                 # skipped. KeyboardInterrupt is not swallowed here.
                 continue
         self.Counter += 10
     # now download the required files
     try:
         for url in self.urlList:
             if self.verbose:
                 p = ' [*] Google PPTX search downloading: ' + str(url)
                 print(helpers.color(p, firewall=True))
             # Reset per URL so cleanup never reuses the previous file name.
             FileName = None
             try:
                 filetype = ".pptx"
                 FileName, FileDownload = dl.download_file2(url, filetype)
                 if FileDownload:
                     if self.verbose:
                         p = ' [*] Google PPTX file was downloaded: ' + \
                             str(url)
                         print(helpers.color(p, firewall=True))
                     ft = helpers.filetype(FileName).lower()
                     if 'powerpoint' in ft:
                         # NOTE(review): other PPTX code paths call
                         # convert_zip_to_text - confirm that Converter
                         # really provides convert_pptx_to_txt.
                         self.Text += convert.convert_pptx_to_txt(FileName)
                     else:
                         self.logger.warning(
                             'Downloaded file is not a PPTX: ' + ft)
             except Exception:
                 print(helpers.color(" [!] Issue with opening PPTX Files\n",
                                     firewall=True))
             # Only clean up when the download step produced a file name.
             if FileName is not None:
                 try:
                     dl.delete_file(FileName)
                 except Exception as e:
                     print(e)
     except Exception:
         print(helpers.color(" [*] No PPTX to download from Google!\n",
                             firewall=True))
예제 #4
0
    def search(self):
        """Harvest text from PPTX files that Exalead lists for ``self.Domain``.

        The page loop runs while ``self.Counter`` is within ``self.Limit``
        and at most 10, stepping by 30. Each result page is appended to
        ``self.Text`` as-is and the hrefs found under
        ``h4.media-heading`` elements become ``self.urlList``. Every
        listed file is then downloaded; files whose detected type
        contains ``powerpoint`` are converted with ``convert_zip_to_text``
        and appended to ``self.Text``. Downloaded files are removed
        afterwards.

        Failures are logged, reported on the console, and the search
        carries on with the next page or the next file.
        """
        dl = Download.Download(self.verbose)
        convert = Converter.Converter(verbose=self.verbose)
        while self.Counter <= self.Limit and self.Counter <= 10:
            helpers.modsleep(1)
            if self.verbose:
                p = ' [*] Exalead PPTX Search on page: ' + str(self.Counter)
                self.logger.info('ExaleadPPTXSearch on page: ' +
                                 str(self.Counter))
                print(helpers.color(p, firewall=True))
            try:
                url = 'http://www.exalead.com/search/web/results/?q="%40' + self.Domain + \
                      '"+filetype:pptx&elements_per_page=' + \
                    str(self.Quanity) + '&start_index=' + str(self.Counter)
            except Exception as e:
                self.logger.error('ExaleadPPTXSearch could not build URL')
                error = " [!] Major issue with Exalead PPTX Search: " + str(e)
                print(helpers.color(error, warning=True))
            try:
                RawHtml = dl.requesturl(url, useragent=self.UserAgent)
                # sometimes url is broken but exalead search results contain
                # e-mail
                self.Text += RawHtml
                soup = BeautifulSoup(RawHtml, "lxml")
                self.urlList = [
                    heading.a["href"]
                    for heading in soup.findAll('h4', class_='media-heading')
                ]
            except Exception as e:
                self.logger.error(
                    'ExaleadPPTXSearch could not request / parse HTML')
                error = " [!] Fail during parsing result: " + str(e)
                print(helpers.color(error, warning=True))
            self.Counter += 30

        # now download the required files
        try:
            for url in self.urlList:
                if self.verbose:
                    p = ' [*] Exalead PPTX search downloading: ' + str(url)
                    self.logger.info('ExaleadPPTXSearch downloading: ' +
                                     str(url))
                    print(helpers.color(p, firewall=True))
                # Reset per URL so cleanup never reuses the previous name.
                FileName = None
                try:
                    filetype = ".pptx"
                    # The Download helper created above is reused here.
                    FileName, FileDownload = dl.download_file(url, filetype)
                    if FileDownload:
                        if self.verbose:
                            p = ' [*] Exalead PPTX file was downloaded: ' + \
                                str(url)
                            # Log under this module's name and record the
                            # URL rather than the console string.
                            self.logger.info('ExaleadPPTXSearch downloaded: ' +
                                             str(url))
                            print(helpers.color(p, firewall=True))
                        ft = helpers.filetype(FileName).lower()
                        if 'powerpoint' in ft:
                            self.Text += convert.convert_zip_to_text(FileName)
                        else:
                            self.logger.warning(
                                'Downloaded file is not a PPTX: ' + ft)
                except Exception as e:
                    error = " [!] Issue with opening PPTX Files:%s" % (str(e))
                    print(helpers.color(error, warning=True))
                # Only clean up when the download step produced a file name.
                if FileName is not None:
                    try:
                        dl.delete_file(FileName)
                    except Exception as e:
                        print(e)
        except Exception:
            self.logger.error("ExaleadPPTXSearch no doc's to download")
            print(helpers.color(" [*] No PPTX's to download from Exalead!\n",
                                firewall=True))

        if self.verbose:
            p = ' [*] Searching PPTX from Exalead Complete'
            print(helpers.color(p, status=True))
예제 #5
0
    def search(self):
        """Harvest text from PDF files that Exalead lists for ``self.Domain``.

        The page loop runs while ``self.Counter`` is within ``self.Limit``
        and at most 10, stepping by 30. Each result page is appended to
        ``self.Text`` as-is and the hrefs found under
        ``h4.media-heading`` elements become ``self.urlList``. Every
        listed file is then downloaded, converted with
        ``convert_pdf_to_txt`` and appended to ``self.Text``; the
        downloaded file is removed afterwards.

        Failures are reported on the console and the search carries on
        with the next page or the next file.
        """
        convert = Converter.Converter(verbose=self.verbose)
        while self.Counter <= self.Limit and self.Counter <= 10:
            time.sleep(1)
            if self.verbose:
                p = ' [*] Exalead Search on page: ' + str(self.Counter)
                print(helpers.color(p, firewall=True))
            try:
                url = 'http://www.exalead.com/search/web/results/?q="%40' + self.Domain + \
                      '"+filetype:pdf&elements_per_page=' + \
                    str(self.Quanity) + '&start_index=' + str(self.Counter)
            except Exception as e:
                error = " [!] Major issue with Exalead PDF Search: " + str(e)
                print(helpers.color(error, warning=True))
                # No usable URL for this page; advance so the loop ends.
                self.Counter += 30
                continue
            try:
                r = requests.get(url, headers=self.UserAgent)
            except Exception as e:
                error = " [!] Fail during Request to Exalead (Check Connection):" + str(
                    e)
                print(helpers.color(error, warning=True))
                # Without a response there is nothing to parse; skipping
                # also avoids a misleading parse error for this page.
                self.Counter += 30
                continue
            try:
                RawHtml = r.content
                # sometimes url is broken but exalead search results contain
                # e-mail
                self.Text += RawHtml
                soup = BeautifulSoup(RawHtml, "lxml")
                self.urlList = [heading.a["href"]
                                for heading in soup.findAll('h4', class_='media-heading')]
            except Exception as e:
                error = " [!] Fail during parsing result: " + str(e)
                print(helpers.color(error, warning=True))
            self.Counter += 30

        # now download the required files
        try:
            # One Download helper serves every file of this run.
            dl = Download.Download(self.verbose)
            for url in self.urlList:
                if self.verbose:
                    p = ' [*] Exalead PDF search downloading: ' + str(url)
                    print(helpers.color(p, firewall=True))
                # Reset per URL so cleanup never reuses the previous name.
                FileName = None
                try:
                    filetype = ".pdf"
                    FileName, FileDownload = dl.download_file(url, filetype)
                    if FileDownload:
                        if self.verbose:
                            p = ' [*] Exalead PDF file was downloaded: ' + \
                                str(url)
                            print(helpers.color(p, firewall=True))
                        self.Text += convert.convert_pdf_to_txt(FileName)
                except Exception as e:
                    # Report the failure instead of dropping it silently.
                    error = " [!] Issue with opening PDF Files: " + str(e)
                    print(helpers.color(error, warning=True))
                # Only clean up when the download step produced a file name.
                if FileName is not None:
                    try:
                        dl.delete_file(FileName)
                    except Exception as e:
                        print(e)
        except Exception:
            print(helpers.color(" [*] No PDF's to download from Exalead!\n", firewall=True))

        if self.verbose:
            p = ' [*] Searching PDF from Exalead Complete'
            print(helpers.color(p, status=True))
예제 #6
0
    def search(self):
        """Harvest text from DOCX files that Exalead lists for ``self.Domain``.

        Result pages are requested while ``self.Counter`` is within
        ``self.Limit``, stepping by 30. Each page is appended to
        ``self.Text`` as-is and the hrefs found under
        ``h4.media-heading`` elements are collected; ``self.urlList``
        holds the links of all pages parsed so far. Every listed file is
        then downloaded, converted with ``convert_docx_to_txt`` and
        appended to ``self.Text``; the downloaded file is removed
        afterwards.

        Failures are logged, reported on the console, and the search
        carries on with the next page or the next file.
        """
        convert = Converter.Converter(verbose=self.verbose)
        # Links gathered over all pages; assigning only the latest page to
        # self.urlList would drop the results of every earlier page.
        found_urls = []
        while self.Counter <= self.Limit:
            time.sleep(1)
            if self.verbose:
                p = ' [*] Exalead Search on page: ' + str(self.Counter)
                self.logger.info("ExaleadDOCXSearch on page: " +
                                 str(self.Counter))
                print(helpers.color(p, firewall=True))
            try:
                url = 'http://www.exalead.com/search/web/results/?q="%40' + self.Domain + \
                      '"+filetype:docx&elements_per_page=' + \
                    str(self.Quanity) + '&start_index=' + str(self.Counter)
            except Exception as e:
                self.logger.error("Issue building URL to search")
                error = " [!] Major issue with Exalead DOCX Search: " + str(e)
                print(helpers.color(error, warning=True))
                # No usable URL for this page; advance so the loop ends.
                self.Counter += 30
                continue
            try:
                r = requests.get(url, headers=self.UserAgent)
            except Exception as e:
                error = " [!] Fail during Request to Exalead (Check Connection):" + str(
                    e)
                print(helpers.color(error, warning=True))
                # Without a response there is nothing to parse; skipping
                # also keeps a previous page from being processed twice.
                self.Counter += 30
                continue
            try:
                RawHtml = r.content
                # sometimes url is broken but exalead search results contain
                # e-mail
                self.Text += RawHtml
                soup = BeautifulSoup(RawHtml, "lxml")
                # Build the page's list first so a failure half-way through
                # leaves the accumulated links untouched.
                page_urls = [
                    heading.a["href"]
                    for heading in soup.findAll('h4', class_='media-heading')
                ]
                found_urls.extend(page_urls)
                self.urlList = found_urls
            except Exception as e:
                self.logger.error("Fail during parsing result: " + str(e))
                error = " [!] Fail during parsing result: " + str(e)
                print(helpers.color(error, warning=True))
            self.Counter += 30

        # now download the required files
        try:
            # One Download helper serves every file of this run.
            dl = Download.Download(self.verbose)
            for url in self.urlList:
                if self.verbose:
                    p = ' [*] Exalead DOCX search downloading: ' + str(url)
                    self.logger.info("Starting download of DOCX: " + str(url))
                    print(helpers.color(p, firewall=True))
                # Reset per URL so cleanup never reuses the previous name.
                FileName = None
                try:
                    filetype = ".docx"
                    FileName, FileDownload = dl.download_file(url, filetype)
                    if FileDownload:
                        if self.verbose:
                            self.logger.info("File was downloaded: " +
                                             str(url))
                            p = ' [*] Exalead DOCX file was downloaded: ' + \
                                str(url)
                            print(helpers.color(p, firewall=True))
                        self.Text += convert.convert_docx_to_txt(FileName)
                except Exception as e:
                    self.logger.error("Issue with opening DOCX Files: " +
                                      str(e))
                    error = " [!] Issue with opening DOCX Files:%s\n" % (
                        str(e))
                    print(helpers.color(error, warning=True))
                # Only clean up when the download step produced a file name.
                if FileName is not None:
                    try:
                        dl.delete_file(FileName)
                    except Exception as e:
                        print(e)
        except Exception as e:
            # str(e) is required: concatenating the exception object itself
            # raises TypeError inside this handler.
            p = " [*] No DOCX's to download from Exalead: " + str(e)
            self.logger.info("No DOCX's to download from Exalead: " + str(e))
            print(helpers.color(p, firewall=True))

        if self.verbose:
            p = ' [*] Searching DOCX from Exalead Complete'
            self.logger.info("Searching DOCX from Exalead Complete")
            print(helpers.color(p, status=True))
예제 #7
0
 def search(self):
     """Harvest text from PDF files that Google lists for ``self.Domain``.

     Phase one walks Google result pages in steps of 10 while
     ``self.Counter`` stays within ``self.Limit`` (and at most 100),
     appending every outbound result link to ``self.urlList``. Phase two
     downloads each collected link, converts it with
     ``convert_pdf_to_txt`` and appends the result to ``self.Text``;
     the downloaded file is removed afterwards.

     Failures are reported on the console and the search carries on with
     the next page or the next file.
     """
     # setup for helpers in the download class
     convert = Converter.Converter(verbose=self.verbose)
     dl = Download.Download(self.verbose)
     while self.Counter <= self.Limit and self.Counter <= 100:
         time.sleep(1)
         if self.verbose:
             p = ' [*] Google PDF Search on page: ' + str(self.Counter)
             print(helpers.color(p, firewall=True))
         try:
             urly = "https://www.google.com/search?q=site:" + \
                 self.Domain + "+filetype:pdf&start=" + str(self.Counter)
         except Exception as e:
             error = " [!] Major issue with Google Search:" + str(e)
             print(helpers.color(error, warning=True))
             # No usable URL for this page; advance so the loop terminates.
             self.Counter += 10
             continue
         try:
             r = requests.get(urly)
         except Exception as e:
             error = " [!] Fail during Request to Google (Check Connection):" + \
                 str(e)
             print(helpers.color(error, warning=True))
             # Without a response there is nothing to parse on this page.
             self.Counter += 10
             continue
         RawHtml = r.content
         # A failing captcha check must not abort the whole search.
         try:
             dl.GoogleCaptchaDetection(RawHtml)
         except Exception as e:
             print(e)
         soup = BeautifulSoup(RawHtml)
         # The target of a result link is read from the ``q`` query
         # parameter of the anchor's href, see:
         # https://stackoverflow.com/questions/21934004/not-getting-proper-links-
         # from-google-search-results-using-mechanize-and-beautifu/22155412#22155412?
         # newreg=01f0ed80771f4dfaa269b15268b3f9a9
         for a in soup.findAll('a'):
             try:
                 link = urlparse.parse_qs(
                     urlparse.urlparse(a['href']).query)['q'][0]
                 if link.startswith(('http', 'www')):
                     if "webcache.googleusercontent.com" not in link:
                         self.urlList.append(link)
             except Exception:
                 # Best effort: anchors without an href / ``q`` value are
                 # skipped. KeyboardInterrupt is not swallowed here.
                 continue
         self.Counter += 10
     # now download the required files
     try:
         for url in self.urlList:
             if self.verbose:
                 p = ' [*] Google PDF search downloading: ' + str(url)
                 print(helpers.color(p, firewall=True))
             # Reset per URL so cleanup never reuses the previous file name.
             FileName = None
             try:
                 filetype = ".pdf"
                 # use new helper class to download file
                 FileName, FileDownload = dl.download_file(url, filetype)
                 # check if the file was downloaded
                 if FileDownload:
                     if self.verbose:
                         p = ' [*] Google PDF file was downloaded: ' + \
                             str(url)
                         print(helpers.color(p, firewall=True))
                     self.Text += convert.convert_pdf_to_txt(FileName)
             except Exception as e:
                 print(e)
             # now remove any files left behind, if a name was produced
             if FileName is not None:
                 try:
                     dl.delete_file(FileName)
                 except Exception as e:
                     print(e)
     except Exception:
         print(helpers.color(" [*] No PDF's to download from Google!\n", firewall=True))
예제 #8
0
 def search(self):
     """Harvest text from XLSX files that Google lists for ``self.Domain``.

     Phase one walks Google result pages in steps of 10 while
     ``self.Counter`` stays within ``self.Limit`` (and at most 100),
     appending every outbound result link to ``self.urlList`` and
     pausing via ``helpers.modsleep`` between pages. Phase two downloads
     each collected link, converts it with ``convert_Xlsx_to_Csv`` and
     appends the result to ``self.Text``; the downloaded file is removed
     afterwards.

     Failures are logged, reported on the console, and the search
     carries on with the next page or the next file.
     """
     convert = Converter.Converter(verbose=self.verbose)
     while self.Counter <= self.Limit and self.Counter <= 100:
         time.sleep(1)
         if self.verbose:
             p = " [*] Google XLSX Search on page: " + str(self.Counter)
             self.logger.info("Google XLSX Search on page: " +
                              str(self.Counter))
             print(helpers.color(p, firewall=True))
         try:
             urly = ("https://www.google.com/search?q=site:" + self.Domain +
                     "+filetype:xlsx&start=" + str(self.Counter))
         except Exception as e:
             error = " [!] Major issue with Google XLSX Search:" + str(e)
             self.logger.error("GoogleXlsxSearch failed to build url: " +
                               str(e))
             print(helpers.color(error, warning=True))
             # No usable URL for this page; keep the pacing and move on.
             self.Counter += 10
             helpers.modsleep(self.Sleep, jitter=self.Jitter)
             continue
         try:
             r = requests.get(urly)
         except Exception as e:
             error = " [!] Fail during Request to Google (Check Connection):" + str(
                 e)
             self.logger.error(
                 "GoogleXlsxSearch failed to request url (Check Connection): "
                 + str(e))
             print(helpers.color(error, warning=True))
             # Without a response there is nothing to parse on this page.
             self.Counter += 10
             helpers.modsleep(self.Sleep, jitter=self.Jitter)
             continue
         RawHtml = r.content
         soup = BeautifulSoup(RawHtml)
         # The target of a result link is read from the ``q`` query
         # parameter of the anchor's href, see:
         # https://stackoverflow.com/questions/21934004/not-getting-proper-links-
         # from-google-search-results-using-mechanize-and-beautifu/22155412#22155412?
         # newreg=01f0ed80771f4dfaa269b15268b3f9a9
         for a in soup.findAll("a"):
             try:
                 link = urlparse.parse_qs(urlparse.urlparse(
                     a["href"]).query)["q"][0]
                 if link.startswith(("http", "www")):
                     if "webcache.googleusercontent.com" not in link:
                         self.urlList.append(link)
             except Exception:
                 # Best effort: anchors without an href / ``q`` value are
                 # skipped. KeyboardInterrupt is not swallowed here.
                 continue
         self.Counter += 10
         helpers.modsleep(self.Sleep, jitter=self.Jitter)
     # now download the required files
     self.logger.debug(
         "GoogleXlsxSearch completed HTML result query, starting downloads")
     try:
         # One Download helper serves every file of this run.
         dl = Download.Download(self.verbose)
         for url in self.urlList:
             if self.verbose:
                 p = " [*] Google XLSX search downloading: " + str(url)
                 self.logger.info("Google XLSX search downloading: " +
                                  str(url))
                 print(helpers.color(p, firewall=True))
             # Reset per URL so cleanup never reuses the previous file name.
             FileName = None
             try:
                 filetype = ".xlsx"
                 FileName, FileDownload = dl.download_file(url, filetype)
                 if FileDownload:
                     if self.verbose:
                         p = " [*] Google XLSX file was downloaded: " + str(
                             url)
                         self.logger.info(
                             "Google XLSX file was downloaded: " + str(url))
                         print(helpers.color(p, firewall=True))
                     self.Text += convert.convert_Xlsx_to_Csv(FileName)
             except Exception:
                 print(helpers.color(" [!] Issue with opening Xlsx Files\n",
                                     firewall=True))
                 self.logger.error("Google XLSX had issue opening file")
             # Only clean up when the download step produced a file name.
             if FileName is not None:
                 try:
                     dl.delete_file(FileName)
                 except Exception as e:
                     self.logger.error("Google XLSX failed to delete file: " +
                                       str(e))
     except Exception as e:
         print(helpers.color(" [*] No XLSX's to download from google!\n",
                             firewall=True))
         self.logger.error("No XLSX's to download from google! " + str(e))