def __init__(self, **kwargs): """ :param kwargs: base_url: (string) The base url of the site to scrape. Default is 'http://pastebin.com'. fast: (boolean) execute a fast scrape -- will not run pattern filters (which are slower). Will only run a keyword search on the text. Default is False. save_filtered: (boolean) If a text passes the filters with a score greater than 0, save the text to file. Default is False. :return: """ self.text_save_path = ROOT_DIR + '/filter_saves' self.metrics_save_path = ROOT_DIR + '/metric_saves' self.base_url = kwargs.get('base_url', self.base_url) self.fast = kwargs.get('fast', self.fast) self.ultra_verbose = kwargs.get('ultra_verbose', self.ultra_verbose) self.save_filtered = kwargs.get('save_filtered', self.save_filtered) self.scraper = PageScraper(url=self.base_url, scrape=False) self.pw_identifier = PWID(fast=self.fast, ultra_verbose=self.ultra_verbose) self.digestor = Digestor()
def __init__(self, url, use_selenium=False):
    """ Initialize the class. """
    PageScraper.__init__(self, url, use_selenium)
    self.additional_site_map_urls = []
    self.page_urls = []
    # Attributes that should be skipped when serializing this scraper.
    self.properties_to_not_encode = [
        'use_selenium', 'content', 'soup', 'driver', 'selenium_driver',
        'selenium_wait_until', 'properties_to_not_encode', 'page_urls',
        'additional_site_map_urls', 'content_location'
    ]
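# Why `properties_to_not_encode` exists: attributes such as the live Selenium
# driver and the parsed soup cannot be JSON-encoded. Below is a minimal sketch
# of a serializer that consumes the list -- the body is an assumption for
# illustration, not the project's actual implementation.
def to_json_serializable_obj(self):
    """Return only the instance attributes that are safe to JSON-encode."""
    return {
        key: value for key, value in self.__dict__.items()
        if key not in self.properties_to_not_encode
    }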
import time

# PageScraper, PWID and log are provided elsewhere in the project.


class PastebinScraper(object):
    pw_identifier = None
    scraper = None
    crawler_delay = .5
    base_url = 'http://pastebin.com'
    scraper_target_url = 'http://pastebin.com/archive'
    fast = False

    def __init__(self, **kwargs):
        self.base_url = kwargs.get('base_url', self.base_url)
        self.fast = kwargs.get('fast', self.fast)
        self.scraper = PageScraper(self.base_url)
        self.pw_identifier = PWID(fast=self.fast)
        # Instance-level list; a class-level mutable default would be
        # shared across all instances.
        self.password_matches = []

    def analyze(self):
        self.scraper.scrape(self.scraper_target_url)
        table_links = self.scraper.find('//table[@class="maintable"]//a/@href')
        links = self.scraper.parse_table_links(table_links)
        page_scraper = PageScraper("http://www.pastebin.com")
        for link in links:
            log.info('Analyzing Link: {}'.format(link))
            page_scraper.scrape(link)
            text = page_scraper.find('//textarea[@class="paste_code"]/text()')
            possible_passwords = None
            if text:
                possible_passwords = self.pw_identifier.identify_passwords(text[0])
            if possible_passwords:
                self.password_matches.append((link, possible_passwords))
            # Throttle requests so the crawler stays polite.
            time.sleep(self.crawler_delay)
        return self.password_matches
def __init__(self, url, selenium_driver=None, selenium_wait_until=None,
             content_location='./data'):
    """ Initialize the class. """
    PageScraper.__init__(self, url, selenium_driver=selenium_driver,
                         selenium_wait_until=selenium_wait_until,
                         content_location=content_location)
    self.page_meta_data = None
    self.occupation_data = None
    self.expense_data = None
    self.days_data = None
    # Attributes that should be skipped when serializing this scraper.
    self.properties_to_not_encode = [
        'use_selenium', 'content', 'soup', 'driver', 'selenium_driver',
        'selenium_wait_until', 'properties_to_not_encode'
    ]
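# A hedged usage sketch for the constructor above. The MoneyDiaryPageScraper
# class name, URL, and wait condition are assumptions for illustration; the
# selenium imports are the library's real API.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome()
scraper = MoneyDiaryPageScraper(
    'https://www.refinery29.com/money-diary',  # assumed URL
    selenium_driver=driver,
    selenium_wait_until=EC.presence_of_element_located((By.TAG_NAME, 'article')),
    content_location='./data',
)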
def to_json_serializable_obj(self):
    """ Create a JSON-serializable object. """
    obj = PageScraper.to_json_serializable_obj(self)
    obj['money_diary_page_urls'] = self.get_money_diary_page_urls()
    return obj
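# A short usage sketch: dump the serializable state produced above to disk.
# The MoneyDiaryScraper name, URL, and file path are assumptions for
# illustration only.
import json

scraper = MoneyDiaryScraper('https://www.refinery29.com/money-diary')
with open('scraper_state.json', 'w') as f:
    json.dump(scraper.to_json_serializable_obj(), f, indent=2)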
def test__get_page_soup_returns_bs4(self, mock_requests):
    mock_requests.get.return_value.content = b'<h1>Test</h1>'
    scraper = PageScraper('www.google.com')
    scraper._get_page_contents()
    soup = scraper._get_page_soup()
    self.assertEqual(soup.get_text(), 'Test')
def test__get_page_contents_returns_content(self, mock_requests):
    mock_requests.get.return_value.content = b'page content'
    scraper = PageScraper('www.google.com')
    self.assertEqual(scraper._get_page_contents(), 'page content')
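# The `mock_requests` parameter in the two tests above implies the requests
# module used inside PageScraper is patched at the class level. A minimal
# harness sketch -- the 'page_scraper.requests' target path is an assumption:
import unittest

import mock  # standalone package on Python 2; unittest.mock on Python 3


@mock.patch('page_scraper.requests')  # assumed patch target
class PageScraperTests(unittest.TestCase):
    # The test methods above would be defined here; mock.patch on the class
    # injects `mock_requests` into each method whose name starts with 'test'.
    pass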
import hashlib
import json
import time
import uuid

# PageScraper, PWID, Digestor, ROOT_DIR and log are provided elsewhere in
# the project.


class PastebinScraper(object):
    pw_identifier = None
    scraper = None
    digestor = None
    crawler_delay = .5
    base_url = 'http://pastebin.com'
    scraper_target_url = 'http://pastebin.com/archive'
    fast = False
    ultra_verbose = True
    save_filtered = True
    text_save_path = ""
    metrics_save_path = ""

    def __init__(self, **kwargs):
        """
        :param kwargs:
            base_url: (string) The base url of the site to scrape.
                Default is 'http://pastebin.com'.
            fast: (boolean) Execute a fast scrape -- skips the pattern
                filters (which are slower) and only runs a keyword search
                on the text. Default is False.
            save_filtered: (boolean) If a text passes the filters with a
                score greater than 0, save the text to file.
                Default is False.
        :return:
        """
        self.text_save_path = ROOT_DIR + '/filter_saves'
        self.metrics_save_path = ROOT_DIR + '/metric_saves'
        self.base_url = kwargs.get('base_url', self.base_url)
        self.fast = kwargs.get('fast', self.fast)
        self.ultra_verbose = kwargs.get('ultra_verbose', self.ultra_verbose)
        self.save_filtered = kwargs.get('save_filtered', self.save_filtered)
        self.scraper = PageScraper(url=self.base_url, scrape=False)
        self.pw_identifier = PWID(fast=self.fast,
                                  ultra_verbose=self.ultra_verbose)
        self.digestor = Digestor()
        # Instance-level state; class-level mutable defaults would be
        # shared across all instances.
        self.password_matches = []
        self._cached = {}

    def analyze(self):
        log.info("Scraping target: %s..." % self.scraper_target_url)
        self.scraper.scrape(self.scraper_target_url)
        log.info("Finding links...")
        table_links = self.scraper.find('//table[@class="maintable"]//a/@href')
        links = self.scraper.parse_table_links(table_links)
        page_scraper = PageScraper("http://www.pastebin.com", scrape=False)
        log.info("Links Found: %s" % len(links))
        for link in links:
            # Skip links that were already analyzed.
            if str(link) in self._cached:
                continue
            self._cached[str(link)] = str(link)
            log.info('Analyzing Link: {}'.format(link))
            page_scraper.scrape(link)
            log.debug("Finding paste text area")
            text = page_scraper.find('//textarea[@class="paste_code"]/text()')
            possible_passwords = None
            score = 0
            digestor_analytics = None
            formatted_text = ''
            if text:
                u_text = text[0].encode('utf-8')
                t_hash = hashlib.sha256()
                t_hash.update(u_text)
                text_digest = t_hash.hexdigest()
                # Skip (and remember) paste bodies that were already hashed,
                # so duplicate content posted under new links is ignored.
                if text_digest in self._cached:
                    continue
                self._cached[text_digest] = text_digest
                log.debug("Running password identifier...")
                possible_passwords, score = self.pw_identifier.identify_passwords(u_text)
                log.debug("Done")
                digestor_analytics = self.digestor.digest(u_text)
                heading = '%s\n\n' % json.dumps(digestor_analytics,
                                                ensure_ascii=False)
                heading += '=' * 40
                heading += '\n\n'
                # Decode the raw bytes back to text before saving.
                formatted_text = heading + u_text.decode('utf-8')
            if possible_passwords:
                self.password_matches.append(
                    (link, score, possible_passwords, digestor_analytics))
            if score > 0 and self.save_filtered:
                self._save_text(formatted_text)
            time.sleep(self.crawler_delay)
        self._save_metrics(self.password_matches)
        return self.password_matches

    def _save_text(self, text):
        file_path = '%s/%s.txt' % (self.text_save_path, uuid.uuid4())
        try:
            with open(file_path, 'w+') as f:
                f.write(text)
        except IOError:
            log.error('Could not write text to file %s' % file_path)

    def _save_metrics(self, metrics_list):
        file_path = '%s/%s.txt' % (self.metrics_save_path, uuid.uuid4())
        try:
            with open(file_path, 'w+') as f:
                for metric in metrics_list:
                    f.write('%s\n' % json.dumps(metric))
        except IOError:
            log.error('Could not write metric to file %s' % file_path)

    def clear_passwords(self):
        self.password_matches = []
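# A hedged usage sketch for the class above. The logging configuration is an
# assumption; the constructor keywords come from the __init__ docstring, and
# the tuple layout matches what analyze() appends to password_matches.
import logging

logging.basicConfig(level=logging.INFO)

scraper = PastebinScraper(fast=True, save_filtered=False)
matches = scraper.analyze()
for link, score, passwords, analytics in matches:
    print('%s scored %s: %s' % (link, score, passwords))
scraper.clear_passwords()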
import time
import uuid

# PageScraper, PWID, ROOT_DIR and log are provided elsewhere in the project.


class PastebinScraper(object):
    pw_identifier = None
    scraper = None
    crawler_delay = .5
    base_url = 'http://pastebin.com'
    scraper_target_url = 'http://pastebin.com/archive'
    fast = False
    ultra_verbose = True
    save_filtered = False
    text_save_path = ""

    def __init__(self, **kwargs):
        """
        :param kwargs:
            base_url: (string) The base url of the site to scrape.
                Default is 'http://pastebin.com'.
            fast: (boolean) Execute a fast scrape -- skips the pattern
                filters (which are slower) and only runs a keyword search
                on the text. Default is False.
            save_filtered: (boolean) If a text passes the filters with a
                score greater than 0, save the text to file.
                Default is False.
        :return:
        """
        self.text_save_path = ROOT_DIR + '/filter_saves'
        self.base_url = kwargs.get('base_url', self.base_url)
        self.fast = kwargs.get('fast', self.fast)
        self.ultra_verbose = kwargs.get('ultra_verbose', self.ultra_verbose)
        self.save_filtered = kwargs.get('save_filtered', self.save_filtered)
        self.scraper = PageScraper(url=self.base_url, scrape=False)
        self.pw_identifier = PWID(fast=self.fast,
                                  ultra_verbose=self.ultra_verbose)
        # Instance-level list; a class-level mutable default would be
        # shared across all instances.
        self.password_matches = []

    def analyze(self):
        log.info("Scraping target: %s..." % self.scraper_target_url)
        self.scraper.scrape(self.scraper_target_url)
        log.info("Finding links...")
        table_links = self.scraper.find('//table[@class="maintable"]//a/@href')
        links = self.scraper.parse_table_links(table_links)
        page_scraper = PageScraper("http://www.pastebin.com", scrape=False)
        log.info("Links Found: %s" % len(links))
        for link in links:
            log.info('Analyzing Link: {}'.format(link))
            page_scraper.scrape(link)
            log.debug("Finding paste text area")
            text = page_scraper.find('//textarea[@class="paste_code"]/text()')
            possible_passwords = None
            score = 0
            if text:
                log.debug("Running password identifier...")
                possible_passwords, score = self.pw_identifier.identify_passwords(text[0])
                log.debug("Done")
            if possible_passwords:
                self.password_matches.append((link, score, possible_passwords))
            if score > 0 and self.save_filtered:
                self._save_text(text[0])
            time.sleep(self.crawler_delay)
        return self.password_matches

    def _save_text(self, text):
        file_path = '%s/%s.txt' % (self.text_save_path, uuid.uuid4())
        try:
            with open(file_path, 'w+') as f:
                f.write(text)
        except IOError:
            log.error('Could not write text to file %s' % file_path)