def parse(self, response):
    """Scrape a spider's HttpRequest.Response for links."""
    # Sanity check: lazily construct the link extractor on first use
    if self._link_extractor is None:
        self._link_extractor = SgmlLinkExtractor()
    # Use scrapy's SgmlLinkExtractor to extract links
    try:
        links = self._link_extractor.extract_links(response)
    except SGMLParseError:
        # Page was poorly formatted, oh well
        _linkscraper_logger.error(
            'Exception encountered when link extracting page')
        return []
    # Wrap each unique link in a ScrapedUrl item
    urls = []
    for link in links:
        url = ScrapedUrl()
        url['url'] = link.url
        url['domain'] = UrlUtility.get_domain(link.url)
        url['last_visited'] = datetime(1, 1, 1)
        if url not in urls:
            urls.append(url)
    return urls

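# For reference, a minimal sketch of what the ScrapedUrl item used above
# might look like. This is an assumption: the real definition lives
# elsewhere in the project, and the field names are inferred from the
# assignments in parse().
from scrapy.item import Item, Field


class ScrapedUrl(Item):
    url = Field()
    domain = Field()
    last_visited = Field()
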
def clean_organization_url(self):
    url = self.cleaned_data['organization_url']
    if not url:
        return None
    try:
        return UrlUtility.get_domain(url)
    except Exception:
        raise ValidationError(
            "Oops! We couldn't recognize that URL's domain.")

def __init__(self, org_model, id):
    """
    Constructs a new SmallOrganization instance.

    Arguments:
        org_model (Organization): The organization from which to create a
            SmallOrganization.
        id (ObjectId): The database ID of the organization.
    """
    self.id = id
    self.page_rank_info = org_model.page_rank_info
    try:
        self.org_domain = UrlUtility.get_domain(org_model.organization_url,
                                                no_exception=False)
    except Exception:
        self.org_domain = None
    self.page_rank = None
    self.page_rank_weight = None

def _merge_page_rank_info(self, new_references, existing_references,
                          organization_url):
    if existing_references is None:
        return new_references

    org_domain = UrlUtility().get_domain(organization_url)
    for ref in new_references.references:
        ref_exists = False
        # Search for existing references from one organization to another
        for exist_ref in existing_references.references:
            if ref.org_domain == exist_ref.org_domain:
                # We found existing data for references from Org A to Org B
                ref_exists = True
                for page in ref.pages:
                    page_exists = False
                    # Search for existing data from this specific URL to
                    # this specific organization
                    for exist_page in exist_ref.pages:
                        if page.url == exist_page.url:
                            # We found existing data for references from
                            # URL A to Org B
                            page_exists = True
                            count_diff = page.count - exist_page.count
                            if count_diff != 0:
                                # This page must have changed, because the
                                # number of references differs; update
                                # everything by the difference
                                exist_page.count = page.count
                                exist_ref.count += count_diff
                                existing_references.total_with_self += count_diff
                                if exist_ref.org_domain != org_domain:
                                    # Only updated if Organization A and B
                                    # are different
                                    existing_references.total += count_diff
                            break
                    if not page_exists:
                        # We have recorded other references to this
                        # organization, but none from this URL
                        exist_ref.pages.append(page)
                        exist_ref.count += page.count
                        existing_references.total_with_self += page.count
                        if exist_ref.org_domain != org_domain:
                            existing_references.total += page.count
                break
        # If this organization has not yet referenced the specified
        # outside org, add it
        if not ref_exists:
            existing_references.references.append(ref)
            existing_references.total_with_self += ref.count
            if ref.org_domain != org_domain:
                existing_references.total += ref.count
    return existing_references

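# A worked example of the merge above, using hypothetical stand-in classes
# (the real reference models live elsewhere in the project; only the
# attribute names are taken from _merge_page_rank_info itself). It shows
# that a re-crawl adjusts totals by the count *difference*, not the sum.
class PageRef(object):
    def __init__(self, url, count):
        self.url = url
        self.count = count


class OrgRef(object):
    def __init__(self, org_domain, count, pages):
        self.org_domain = org_domain
        self.count = count
        self.pages = pages


class PageRankInfo(object):
    def __init__(self, total, total_with_self, references):
        self.total = total
        self.total_with_self = total_with_self
        self.references = references


# Suppose http://a.com/p previously linked to example.org twice, and a
# fresh crawl now finds three links:
existing = PageRankInfo(2, 2, [OrgRef('example.org', 2,
                                      [PageRef('http://a.com/p', 2)])])
new = PageRankInfo(3, 3, [OrgRef('example.org', 3,
                                 [PageRef('http://a.com/p', 3)])])
# Merging new into existing finds count_diff == 1, so existing.total and
# existing.total_with_self both become 3 -- only the +1 difference is added.
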
def check_valid_org(self, response):
    """
    Checks if the current page is a valid page for an organization's
    homepage.

    Arguments:
        response (Response): Scrapy Response object of the page that is to
            be scraped.

    Returns:
        True if it's a valid organization page or already in the database.
        False if it's not the homepage.
    """
    # If already in the database, then it is valid
    url = OrgUrlScraper().parse(response)
    org_dto = self.org_dao().find(organization_url=url)
    if org_dto:
        return True

    # If not the homepage, return False and make sure the homepage is
    # added to the scrape queue
    home_url_obj = urlparse(response.request.url)
    if home_url_obj.path and home_url_obj.path != '/':
        home_url = home_url_obj.scheme + '://' + home_url_obj.netloc + '/'
        home_domain = UrlUtility.get_domain(home_url)
        meta = URLMetadata(url=home_url, domain=home_domain,
                           last_visited=datetime(1, 1, 1))
        self.url_frontier.put_url(meta)
        return False
    else:
        # This is the homepage; scrape its text for the required keywords
        hxs = HtmlXPathSelector(response)
        site_text = hxs.select('//html//text()').extract()
        site_text = [element.strip() for element in site_text
                     if element.strip() != '']
        for word in self._required_words:
            for sentence in site_text:
                sentence = self._punctuation.sub(' ', sentence)
                if word in sentence.lower():
                    return True
        # No keyword found, so this is not a valid organization page
        return False

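# A quick standard-library illustration of the homepage normalization used
# above. urlparse here is the Python 2 module (in Python 3 the same
# function lives in urllib.parse); the example URL is hypothetical.
from urlparse import urlparse

parts = urlparse('http://www.example.org/team/about')
assert parts.path == '/team/about'  # non-empty and not '/': not a homepage
home_url = parts.scheme + '://' + parts.netloc + '/'
assert home_url == 'http://www.example.org/'
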
def request_organization(request):
    """
    Sends a request to the Request Organization page if the user is logged
    in.

    Returns:
        A rendered page containing the Request Organization form.
    """
    if 'user_id' not in request.session:
        logger.error('Bad request made for organization seed without login')
        return unauthorized(request)

    user_id = request.session['user_id']
    form = RequestOrgForm(request.POST or None)
    error = ''
    success = ''
    if request.method == 'POST':
        if form.is_valid():
            url = form.cleaned_data['url']
            dao = ctx.get_object('URLMetadataDAO')
            try:
                metadata = URLMetadata(url=url,
                                       domain=UrlUtility.get_domain(url))
            except ValueError:
                error = "Oops! We don't recognize that domain. Please try another."
            if not error:
                try:
                    dto = DTOConverter.to_dto(URLMetadataDTO, metadata)
                    dao.create_update(dto)
                    logger.info('Org seed with url={0} requested by user={1}'
                                .format(url, user_id))
                    success = 'Your request has been sent successfully!'
                except Exception:
                    error = ('Something went wrong with your request. '
                             'Please try again later.')
    return render(request, 'organization/request_organization.html',
                  {'form': form, 'success': success, 'error': error})

def clean_url(self):
    url = self.cleaned_data['url']
    ctx = ApplicationContext(DAOContext())
    org_dao = ctx.get_object('OrganizationDAO')
    url_metadata_dao = ctx.get_object('URLMetadataDAO')
    try:
        domain = UrlUtility().get_domain(url)
    except Exception:
        raise ValidationError(
            "Oops! We couldn't find information on that domain.")
    if (org_dao.find(organization_url=domain) or
            url_metadata_dao.find(domain=domain)):
        raise ValidationError(
            "Oops! Looks like we already have information on that "
            "organization.")
    return url

def parse(self, response):
    """Scrape a spider's HttpRequest.Response for links."""
    # Get the domain of the page being scraped
    try:
        org_domain = UrlUtility.get_domain(response.request.url, False)
    except Exception:
        _linkscraper_logger.error(
            'Exception encountered when trying to find the domain of ' +
            response.request.url)
        # Without the page's own domain we cannot separate self-references,
        # so bail out instead of hitting a NameError below
        return None
    # Sanity check: lazily construct the link extractor on first use
    if self._link_extractor is None:
        self._link_extractor = SgmlLinkExtractor()
    # Use scrapy's SgmlLinkExtractor to extract links
    try:
        links = self._link_extractor.extract_links(response)
    except SGMLParseError:
        # Page was poorly formatted, oh well
        _linkscraper_logger.error(
            'Exception encountered when PageRankInfo scraping page')
        return None
    # Tally these links into our page rank info structure
    page_rank_info = {"total": 0, "total_with_self": 0, "references": []}
    for link in links:
        url = link.url
        try:
            domain = UrlUtility.get_domain(url, False)
        except Exception:
            _linkscraper_logger.error(
                'Exception encountered when trying to find the domain of ' +
                url)
            continue
        ref_found = False
        for ref in page_rank_info["references"]:
            if ref["org_domain"] == domain:
                # We already saw a link to this domain; bump its counts
                ref_found = True
                ref["count"] += 1
                ref["pages"][0]["count"] += 1
                page_rank_info["total_with_self"] += 1
                if domain != org_domain:
                    page_rank_info["total"] += 1
                break
        if not ref_found:
            # First link from this page to this domain
            page_rank_info["references"].append({
                "org_domain": domain,
                "count": 1,
                "pages": [{"url": response.url, "count": 1}],
            })
            page_rank_info["total_with_self"] += 1
            if domain != org_domain:
                page_rank_info["total"] += 1
    return page_rank_info

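# For illustration, a page at http://a.com/about (a hypothetical URL) that
# links twice to http://b.org/x and once back to http://a.com/ would yield:
# {
#     "total": 2,              # external references only
#     "total_with_self": 3,    # includes self-references to a.com
#     "references": [
#         {"org_domain": "b.org", "count": 2,
#          "pages": [{"url": "http://a.com/about", "count": 2}]},
#         {"org_domain": "a.com", "count": 1,
#          "pages": [{"url": "http://a.com/about", "count": 1}]},
#     ],
# }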