def __init__(self, *args, **kwargs):
    """Initialize the spider: pin the working directory to this file's
    location, construct the page scraper, and prepare a clean Output/
    directory for results.

    NOTE(review): this appears to be a stray duplicate of
    StopTraffickingSpider.__init__ — confirm whether it can be removed.
    """
    # Remember the caller's working directory so __del__ can restore it.
    self.saved_path = os.getcwd()
    os.chdir(os.path.dirname(os.path.abspath(__file__)))
    super(StopTraffickingSpider, self).__init__(*args, **kwargs)
    self.scraper = StopTraffickingDotInScraper()
    self.first = True            # tracks whether the next response is the directory page
    self.directory_results = None  # filled with directory entries on first parse
    if os.path.exists("Output/"):
        # Directory already present: drop any stale output file from a
        # previous run; missing file is fine.
        try:
            os.remove("Output/specific_page_scraper_output.txt")
        except OSError:
            pass
    else:
        os.makedirs("Output/")
class StopTraffickingSpider(BaseSpider):
    """A class that crawls a very specific site so we can have more data."""

    name = "stop_trafficking"
    allowed_domains = ['stoptrafficking.in']
    start_urls = ['http://www.stoptrafficking.in/Directory.aspx']

    def __init__(self, *args, **kwargs):
        """Initialize the spider.

        Pins the working directory to this file's location (restored in
        __del__), builds the page scraper, and ensures a clean Output/
        directory.
        """
        # Remember the caller's working directory so __del__ can restore it.
        self.saved_path = os.getcwd()
        os.chdir(os.path.dirname(os.path.abspath(__file__)))
        super(StopTraffickingSpider, self).__init__(*args, **kwargs)
        self.scraper = StopTraffickingDotInScraper()
        self.first = True            # True until the directory page is parsed
        self.directory_results = None  # directory entries awaiting popup pages
        if not os.path.exists("Output/"):
            os.makedirs("Output/")
        else:
            # Remove stale output from a previous run; missing file is fine.
            try:
                os.remove("Output/specific_page_scraper_output.txt")
            except OSError:
                pass

    def __del__(self):
        # Restore the working directory changed in __init__.
        os.chdir(self.saved_path)

    def parse(self, response):
        """Parse this super specific page.

        The first response is the directory page: scrape its entries and
        yield a Request per popup URL. Every later response is a popup
        page: match it back to its directory entry and yield the scraped
        item plus URL metadata.
        """
        if self.first:
            self.first = False
            # Grab directory entries and request each popup page.
            results = self.scraper.parse_directory(response)
            self.directory_results = results
            for result in results:
                yield Request(result.popup_url)
            # The directory page itself has no matching popup entry —
            # without this return the lookup below would fail on it.
            return

        # Grab the corresponding table entry. BUG FIX: next() previously
        # had no default, so a non-matching URL raised StopIteration
        # (RuntimeError inside a generator since PEP 479) and the
        # None-guard below was dead code.
        table_entry = next(
            (entry for entry in self.directory_results
             if entry.popup_url == response.url),
            None)
        if table_entry is None:
            return

        # Cleanup: each directory entry is consumed exactly once.
        self.directory_results.remove(table_entry)

        items = self.scraper.parse_popup(response, table_entry)
        url_item = self._get_url_metadata(items)
        yield items
        # BUG FIX: only yield metadata when present — Scrapy rejects
        # a yielded None.
        if url_item is not None:
            yield url_item

    def _get_url_metadata(self, item):
        """
        Gets the metadata for the page.

        Arguments:
            item (dictionary): Dictionary of a contact or an organization.

        Returns:
            url_item (dictionary): Dictionary of url metadata, or None when
                item is not a ScrapedOrganization or has no organization_url.
        """
        if not isinstance(item, ScrapedOrganization) \
                or item['organization_url'] is None or item['organization_url'] == "":
            return None

        url_item = ScrapedUrl()
        # Add http://'s since we removed them
        url_item['url'] = 'http://' + item['organization_url']
        url_item['domain'] = UrlUtility.get_domain(item['organization_url'])
        # Sentinel "never visited" timestamp.
        url_item['last_visited'] = datetime(1, 1, 1)
        return url_item