Code example #1
File: link_scraper.py Project: ppoulsen/HTResearch
    def parse(self, response):
        """Scrape a spider's HttpRequest.Response for links"""

        # sanity check
        if self._link_extractor is None:
            self._link_extractor = SgmlLinkExtractor()

        # use scrapy SgmlLinkExtractor to extract links
        try:
            links = self._link_extractor.extract_links(response)
        except SGMLParseError as e:
            # Page was poorly formatted, oh well
            _linkscraper_logger.error('Exception encountered when link extracting page: %s', e)
            return []

        # add these links to our Url item
        urls = list()
        for link in links:
            url = ScrapedUrl()
            url['url'] = link.url
            url['domain'] = UrlUtility.get_domain(link.url)
            url['last_visited'] = datetime(1, 1, 1)  # sentinel date: never visited
            if url not in urls:
                urls.append(url)

        return urls
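Every snippet on this page leans on UrlUtility.get_domain, whose implementation is not shown here. From the call sites one can infer the contract: it maps a URL to its domain, takes an optional no_exception flag (see example #3), and can raise ValueError on unrecognized input (example #5 catches exactly that). Below is a minimal sketch under those assumptions; the default flag value and the 'www.' stripping are guesses, not HTResearch's actual behavior:

from urllib.parse import urlparse


class UrlUtility(object):
    """Sketch only; the real HTResearch utility is not shown on this page."""

    @staticmethod
    def get_domain(url, no_exception=False):
        # assumed contract: return the host part of a URL, raising unless
        # no_exception is True
        if '://' not in url:
            url = 'http://' + url
        netloc = urlparse(url).netloc.lower()
        if not netloc:
            if no_exception:
                return ''
            raise ValueError("could not parse a domain from '%s'" % url)
        # assumption: treat 'www.example.org' and 'example.org' as one domain
        return netloc[4:] if netloc.startswith('www.') else netloc


print(UrlUtility.get_domain('http://www.example.org/about'))  # example.org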
Code example #2
File: models.py Project: RaikesSchoolDS/HTResearch
    def clean_organization_url(self):
        url = self.cleaned_data['organization_url']
        if url:
            try:
                return UrlUtility.get_domain(url)
            except Exception:
                raise ValidationError("Oops! We couldn't recognize that URL's domain.")
        else:
            return None
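For context, a clean_<field> hook like this belongs on a Django form, and whatever it returns replaces the raw input in cleaned_data, so the form ends up holding the domain rather than the full URL. A minimal sketch of the surrounding form; the class name and field definition below are illustrative guesses, not HTResearch's actual models.py:

from django import forms
from django.core.exceptions import ValidationError


class OrganizationForm(forms.Form):  # hypothetical name
    organization_url = forms.CharField(required=False)

    def clean_organization_url(self):
        url = self.cleaned_data['organization_url']
        if url:
            try:
                return UrlUtility.get_domain(url)  # cleaned_data now stores the domain
            except Exception:
                raise ValidationError("Oops! We couldn't recognize that URL's domain.")
        return None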
Code example #3
File: helper_classes.py Project: ppoulsen/HTResearch
    def __init__(self, org_model, id):
        """
        Constructs a new SmallOrganization instance.

        Arguments:
            org_model (Organization): The organization from which to create a SmallOrganization.
            id (ObjectId): The database ID of the organization.
        """
        self.id = id
        self.page_rank_info = org_model.page_rank_info
        try:
            self.org_domain = UrlUtility.get_domain(org_model.organization_url, no_exception=False)
        except Exception:
            self.org_domain = None
        self.page_rank = None
        self.page_rank_weight = None
Code example #4
    def check_valid_org(self, response):
        """
        Checks if the current page is a valid page for an organization's homepage.

        Arguments:
            response (Response): Scrapy Response object of the page that is to be scraped.

        Returns:
            True if it's a valid organization page or already in the database.
            False if it's not the homepage.
        """
        # If already in database, then valid
        url = OrgUrlScraper().parse(response)
        org_dto = self.org_dao().find(organization_url=url)
        if org_dto:
            return True

        # If not homepage, then return false and make sure homepage is added to scrape:
        home_url_obj = urlparse(response.request.url)
        if home_url_obj.path and home_url_obj.path != '/':
            home_url = home_url_obj.scheme + '://' + home_url_obj.netloc + '/'
            home_domain = UrlUtility.get_domain(home_url)
            meta = URLMetadata(url=home_url,
                               domain=home_domain,
                               last_visited=datetime(1, 1, 1))
            self.url_frontier.put_url(meta)
            return False
        else:
            # this is homepage, scrape for keywords
            hxs = HtmlXPathSelector(response)
            site_text = hxs.select('//html//text()').extract()
            site_text = [
                element.strip() for element in site_text
                if element.strip() != ''
            ]

            for word in self._required_words:
                for sentence in site_text:
                    sentence = self._punctuation.sub(' ', sentence)
                    if word in sentence.lower():
                        return True

        # no keyword found; not a valid organization homepage
        return False
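The homepage test above hinges on urlparse: a URL counts as a homepage only when its path is empty or '/'. A quick standalone illustration:

from urllib.parse import urlparse

for u in ('http://example.org', 'http://example.org/', 'http://example.org/about'):
    path = urlparse(u).path
    print(u, '->', repr(path), 'homepage' if path in ('', '/') else 'not homepage')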
Code example #5
def request_organization(request):
    """
    Renders the Request Organization page if the user is logged in and processes the submitted form.

    Returns:
        A rendered page containing the Request Organization form.
    """
    if 'user_id' not in request.session:
        logger.error('Bad request made for organization seed without login')
        return unauthorized(request)
    else:
        user_id = request.session['user_id']

    form = RequestOrgForm(request.POST or None)
    error = ''
    success = ''

    if request.method == 'POST':
        if form.is_valid():
            url = form.cleaned_data['url']
            dao = ctx.get_object('URLMetadataDAO')

            try:
                metadata = URLMetadata(url=url,
                                       domain=UrlUtility.get_domain(url))
            except ValueError:
                error = "Oops! We don't recognize that domain. Please try another."

            if not error:
                try:
                    dto = DTOConverter.to_dto(URLMetadataDTO, metadata)
                    dao.create_update(dto)
                    logger.info(
                        'Org seed with url={0} requested by user={1}'.format(
                            url, user_id))
                    success = 'Your request has been sent successfully!'
                except Exception:
                    error = 'Something went wrong with your request. Please try again later.'

    return render(request, 'organization/request_organization.html', {
        'form': form,
        'success': success,
        'error': error
    })
Code example #6
File: link_scraper.py Project: ppoulsen/HTResearch
    def parse(self, response):
        """Scrape a spider's HttpRequest.Response for links"""

        # get domain
        try:
            org_domain = UrlUtility.get_domain(response.request.url, False)
        except Exception as e:
            _linkscraper_logger.error('Exception encountered when trying to find the domain of %s: %s', response.request.url, e)
            org_domain = None  # avoid a NameError below; self-links simply won't be excluded from "total"

        # sanity check
        if self._link_extractor is None:
            self._link_extractor = SgmlLinkExtractor()

        # use scrapy SgmlLinkExtractor to extract links
        try:
            links = self._link_extractor.extract_links(response)
        except SGMLParseError as e:
            # Page was poorly formatted, oh well
            _linkscraper_logger.error('Exception encountered when PageRankInfo scraping page: %s', e)
            return None

        # add these links to our Page Rank Info
        page_rank_info = {
            "total": 0,
            "total_with_self": 0,
            "references": []
        }
        for link in links:
            url = link.url
            try:
                domain = UrlUtility.get_domain(url, False)
            except Exception as e:
                _linkscraper_logger.error('Exception encountered when trying to find the domain of %s: %s', url, e)
                continue
            ref_found = False
            for ref in page_rank_info["references"]:
                if ref["org_domain"] == domain:
                    ref_found = True
                    ref["count"] += 1
                    ref["pages"][0]["count"] += 1
                    page_rank_info["total_with_self"] += 1
                    if domain != org_domain:
                        page_rank_info["total"] += 1
                    break
            if not ref_found:
                page_rank_info["references"].append(
                    {
                        "org_domain": domain,
                        "count": 1,
                        "pages": [
                            {
                                "url": response.url,
                                "count": 1
                            }
                        ]
                    }
                )
                page_rank_info["total_with_self"] += 1
                if domain != org_domain:
                    page_rank_info["total"] += 1

        return page_rank_info
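For reference, here is the shape of the structure this method returns. Suppose the scraped page is http://example.org/ and it links twice to partner.org and once back to itself; tracing the loop above by hand gives (hand-worked values, not real output):

page_rank_info = {
    'total': 2,            # links to other domains; self-references excluded
    'total_with_self': 3,  # every extracted link, self-references included
    'references': [
        {'org_domain': 'partner.org', 'count': 2,
         'pages': [{'url': 'http://example.org/', 'count': 2}]},
        {'org_domain': 'example.org', 'count': 1,
         'pages': [{'url': 'http://example.org/', 'count': 1}]},
    ],
}

Note that 'pages' records the page being scraped (response.url), not the link target, which is why each reference holds exactly one page entry per parse call.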