import os
from datetime import datetime

from scrapy.http import Request
from scrapy.spider import BaseSpider

# Project-local helpers; these import paths are assumptions.
from scraper.stop_trafficking_dot_in_scraper import StopTraffickingDotInScraper
from scraper.items import ScrapedOrganization, ScrapedUrl
from scraper.url_utility import UrlUtility

class StopTraffickingSpider(BaseSpider):
    """Crawls the stoptrafficking.in directory to gather additional data."""
    name = "stop_trafficking"
    allowed_domains = ['stoptrafficking.in']
    start_urls = ['http://www.stoptrafficking.in/Directory.aspx']

    def __init__(self, *args, **kwargs):
        # Run relative to this file so the Output/ directory lands beside it.
        self.saved_path = os.getcwd()
        os.chdir(os.path.dirname(os.path.abspath(__file__)))

        super(StopTraffickingSpider, self).__init__(*args, **kwargs)
        self.scraper = StopTraffickingDotInScraper()
        self.first = True
        self.directory_results = None

        # Start each run with a clean output file.
        if not os.path.exists("Output/"):
            os.makedirs("Output/")
        else:
            try:
                os.remove("Output/specific_page_scraper_output.txt")
            except OSError:
                pass

    def __del__(self):
        # Restore the working directory changed in __init__.
        os.chdir(self.saved_path)

    def parse(self, response):
        """Parse the directory page, then each popup page it links to."""

        # First pass: this is the directory page. Collect its entries,
        # queue a Request for each popup page, and stop here; the popup
        # responses come back through this same callback.
        if self.first:
            self.first = False
            self.directory_results = self.scraper.parse_directory(response)
            for result in self.directory_results:
                yield Request(result.popup_url)
            return

        # Later passes: match the popup response to its directory entry.
        table_entry = next(
            (entry for entry in self.directory_results
             if entry.popup_url == response.url),
            None)

        # Each entry is consumed exactly once, so drop it after matching.
        if table_entry is not None:
            self.directory_results.remove(table_entry)

        items = self.scraper.parse_popup(response, table_entry)
        url_item = self._get_url_metadata(items)

        yield items
        if url_item is not None:
            yield url_item

    def _get_url_metadata(self, item):
        """
        Builds a URL metadata item for an organization's website.

        Arguments:
            item (ScrapedOrganization): Item scraped from a popup page.

        Returns:
            ScrapedUrl with the url, domain, and a sentinel last_visited
            date, or None if the item has no organization URL.
        """
        if not isinstance(item, ScrapedOrganization) \
           or not item['organization_url']:
            return None

        url_item = ScrapedUrl()
        # Re-add the scheme; it was stripped when the URL was scraped.
        url_item['url'] = 'http://' + item['organization_url']
        url_item['domain'] = UrlUtility.get_domain(item['organization_url'])
        # Sentinel date meaning "never visited".
        url_item['last_visited'] = datetime(1, 1, 1)

        return url_item
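
Within a Scrapy project this spider would normally be launched from the
project root with the scrapy CLI, using the spider name defined above
(assuming the usual scrapy.cfg project layout):

    scrapy crawl stop_trafficking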