def associate_project_metrics(project):
    """
    Checks to ensure that all the parsed domains of the given project are represented by a
    URLMetrics association. A new URLMetrics object is created if an appropriate one does not exist.

    Args:
        project (UserProject): The project to check.
    """
    for pd in project.projectdomain_set.all():
        metric_associated = False
        for um in project.urlmetrics.all():
            if um.query_url == pd.domain:
                metric_associated = True
                break
        if not metric_associated:
            # Reuse an existing URLMetrics record for this domain if one exists,
            # otherwise create and save a new one.
            try:
                newum = URLMetrics.objects.get(query_url=pd.domain)
            except URLMetrics.DoesNotExist:
                newum = URLMetrics(query_url=pd.domain)
                newum.save()
            pm = ProjectMetrics(project=project, urlmetrics=newum, is_checked=False, is_extension=False)
            pm.save()
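# For orientation, a minimal sketch of the model relationships that
# associate_project_metrics() relies on. Only fields referenced in these tasks are
# shown; the field types, options and defaults are illustrative assumptions, not
# copied from the real models.

from django.db import models

class URLMetrics(models.Model):
    query_url = models.CharField(max_length=255)
    last_updated = models.DateTimeField(null=True, blank=True)
    mozrank_10 = models.FloatField(null=True, blank=True)
    extended_from = models.ForeignKey('self', null=True, blank=True)


class ProjectMetrics(models.Model):
    # Through-table linking UserProject.urlmetrics to URLMetrics.
    project = models.ForeignKey('UserProject')
    urlmetrics = models.ForeignKey(URLMetrics)
    is_checked = models.BooleanField(default=False)
    is_extension = models.BooleanField(default=False)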
def handle(self, *args, **options):
    pm_total = 0
    pm_checked = 0
    dup_removed = 0
    with transaction.atomic():
        for p in UserProject.objects.all():
            # Remove all non-extension metric links; they are rebuilt below.
            for pm in ProjectMetrics.objects.filter(project=p, is_extension=False):
                pm.delete()
            # Refresh the checked state of extension metric links.
            for pm in ProjectMetrics.objects.filter(project=p, is_extension=True):
                if pm.urlmetrics.is_uptodate():
                    pm_checked += 1
                    pm.is_checked = True
                else:
                    pm.is_checked = False
                pm.save()
            for pd in p.projectdomain_set.filter(state=u'available'):
                ums = URLMetrics.objects.filter(query_url=pd.domain).order_by(u'-last_updated')
                if len(ums) == 0:
                    um = URLMetrics(query_url=pd.domain)
                    um.save()
                    pm = ProjectMetrics(project=p, urlmetrics=um, is_checked=False)
                    pm.save()
                else:
                    # Keep only the most recently updated metrics record for this domain.
                    if len(ums) > 1:
                        for dum in ums[1:]:
                            dup_removed += 1
                            dum.delete()
                    um = ums[0]
                    pm = ProjectMetrics(project=p, urlmetrics=um)
                    if um.is_uptodate():
                        pm.is_checked = True
                        pm_checked += 1
                    else:
                        pm.is_checked = False
                    pm.save()
                pm_total += 1

    self.stdout.write('Statistics:')
    self.stdout.write(' Project metric links: %d' % pm_total)
    self.stdout.write(' Checked/up-to-date: %d' % pm_checked)
    self.stdout.write(' Duplicate metric records removed: %d' % dup_removed)
    self.stdout.write(' Total metric records (post-removal): %d' % URLMetrics.objects.count())
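# URLMetrics.is_uptodate() is used throughout these tasks but is not part of this
# excerpt. A plausible sketch is a simple freshness check on last_updated; the
# 30-day window below is an assumption, not the project's actual cut-off (which may
# well come from an AdminSetting).

from datetime import timedelta
from django.utils import timezone

def is_uptodate(self):
    """Return True if this URLMetrics record is recent enough to be reused."""
    if self.last_updated is None:
        return False
    return timezone.now() - self.last_updated < timedelta(days=30)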
def get_extensions(urlmetrics):
    """
    Returns a list of URLMetrics that represent extensions (i.e. addition of 'www.')
    to the given URLMetrics.
    """
    # ex_prefixes = ['www.']
    ex_prefixes = [x.prefix + '.' for x in ExtensionPrefix.objects.all()]
    extensions = []
    for ex in ex_prefixes:
        if urlmetrics.query_url.startswith(ex):
            # Skip if the given object already starts with the extension
            continue
        extension_url = ex + urlmetrics.query_url
        # Check if a URLMetrics object with the given extension already exists, otherwise create it
        try:
            exu = URLMetrics.objects.get(query_url=extension_url)
        except URLMetrics.DoesNotExist:
            exu = URLMetrics(query_url=extension_url)
            exu.extended_from = urlmetrics
            exu.save()
        extensions.append(exu)
    return extensions
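# ExtensionPrefix is not included in this excerpt. Judging by its use above
# (x.prefix + '.'), it is presumably a small lookup table of prefixes stored without
# the trailing dot; this sketch is an assumption. With a single 'www' row,
# get_extensions() on a URLMetrics for 'example.com' would return (creating it if
# necessary) a URLMetrics for 'www.example.com', linked back via extended_from.

class ExtensionPrefix(models.Model):
    prefix = models.CharField(max_length=32)  # e.g. u'www'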
def update_project_metrics(project_id):
    """
    Updates all the URLMetrics associated with the given project id through the Moz API.
    If the MozRank of a URL is over the set threshold, extension URLs are created and also checked.

    Args:
        project_id (int): The ID of the project to update.
    """
    p = UserProject.objects.get(id=project_id)

    # Retrieve all fields available with free Moz API registration
    cols = URLMetrics.create_cols_bitflag([
        'Title',
        'Canonical URL',
        'External Links',
        'Links',
        'MozRank 10',
        'MozRank Raw',
        'Subdomain MozRank 10',
        'Subdomain MozRank Raw',
        'HTTP Status Code',
        'Page Authority',
        'Domain Authority'])

    wait_time = AdminSetting.get_moz_api_wait_time()
    mozrank_extension_threshold = AdminSetting.get_value('mozrank_extension_threshold')

    associate_project_metrics(p)

    pmetrics = ProjectMetrics.objects.filter(project=p, is_checked=False)
    for pm in pmetrics:
        with transaction.atomic():
            if not pm.urlmetrics.is_uptodate():
                check_moz_domain(pm.urlmetrics, cols, wait_time)
            if not pm.is_extension and pm.urlmetrics.mozrank_10 >= mozrank_extension_threshold:
                # High-ranking domain: create and check extension URLs (e.g. 'www.' prefixed).
                extensions = get_extensions(pm.urlmetrics)
                print u'Getting extensions (%d)' % len(extensions)
                for ex in extensions:
                    print u' %s' % ex.query_url
                    try:
                        newpm = ProjectMetrics.objects.get(project=p, urlmetrics=ex)
                    except ProjectMetrics.DoesNotExist:
                        newpm = ProjectMetrics(project=p, urlmetrics=ex, is_checked=True, is_extension=True)
                    if not ex.is_uptodate():
                        print u' Checking extension: %s' % ex.query_url
                        check_moz_domain(ex, cols, wait_time)
                    else:
                        print u' Extension already checked: %s' % ex.query_url
                    newpm.is_checked = True
                    newpm.save()
            pm.is_checked = True
            pm.save()

    p.update_state()
    p.save()
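# URLMetrics.create_cols_bitflag() is referenced above but not shown. The Moz URL
# Metrics API selects response fields via a 'Cols' bit flag, so a plausible sketch
# simply ORs together per-column flag values. The flag values below are placeholders,
# not the real ones; they should be taken from the Moz API documentation.

@classmethod
def create_cols_bitflag(cls, col_names):
    COL_FLAGS = {
        'Title': 1,          # placeholder value
        'Canonical URL': 4,  # placeholder value
        # ... remaining columns omitted; fill in from the Moz API docs.
    }
    flags = 0
    for name in col_names:
        flags |= COL_FLAGS[name]
    return flags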
def check_project_domains(project_id):
    """
    Use the Namecheap API to update availability status for all the domains associated with the given project.

    Args:
        project_id (int): The ID of the project to check domains for.
    """
    lock = NamecheapLock()
    project = UserProject.objects.get(id=project_id)

    # Enable debug output
    if settings.DEBUG:
        logging.basicConfig()
        logging.getLogger().setLevel(logging.DEBUG)
        requests_log = logging.getLogger(u'requests.packages.urllib3')
        requests_log.setLevel(logging.DEBUG)
        requests_log.propagate = True

    while True:
        lock.acquire()
        try:
            # Retrieve list of unchecked domains (limited by the set limit of domains per call)
            domain_list = project.projectdomain_set.filter(is_checked=False)[:AdminSetting.get_api_urls_per_request()]

            # If no domains unchecked, progress project to the next stage (usually metrics measuring)
            if domain_list.count() == 0:
                print u'No domains found.'
                project.update_state(save=False)
                project.save()
                lock.release()
                break

            # Fold the list into a dictionary for easy reference
            domains = dict([(d.domain, d) for d in domain_list])
            domain_str = u','.join(domains.keys())

            params = AdminSetting.get_api_params()
            params.append((u'Command', u'namecheap.domains.check'))
            params.append((u'DomainList', domain_str))

            print u'Domains that will be checked: %s' % domain_str
            print params

            # Make the call to the Namecheap API (retry 3 times then fail)
            retries = 0
            while True:
                try:
                    r = requests.get(AdminSetting.get_api_url(), params=params)
                    break
                except requests.exceptions.ConnectionError as ce:
                    retries += 1
                    if retries >= 3:
                        raise ce
                    time.sleep(5)

            sc = r.status_code
            print u'Status code: %d' % sc

            if sc == 200:
                rxml = r.text.encode(u'utf-8')
                (domain_results, error_results) = parse_namecheap_result(rxml)

                if len(domain_results) == 0 and len(error_results) > 0:
                    # Handle specific but rare Namecheap API errors gracefully
                    for er in error_results:
                        if int(er[u'number']) == 2030280:
                            # TLD not found - assume same result for all
                            for domain, d in domains.items():
                                d.state = u'error'
                                d.error = u'API unable to parse TLD for this domain (possible encoding issue)'
                                d.is_checked = True
                                d.last_checked = timezone.now()
                                d.save()
                            break
                        elif int(er[u'number']) == 3031510:
                            # Denied authorization for this domain
                            for domain, d in domains.items():
                                d.state = u'error'
                                d.error = u'API denies authorisation to check this domain (reason not given)'
                                d.is_checked = True
                                d.last_checked = timezone.now()
                                d.save()
                            break
                        else:
                            # Assume catastrophic error
                            error_str = u'the API backend returned the following unrecoverable error(s):\n\n'
                            error_str += u'\n'.join([u' %d: [%s] %s' % (i + 1, er[u'number'], er[u'description'])
                                                     for i, er in enumerate(error_results)])
                            raise Exception(error_str)

                # Match the call results to the domain list and store them. If appropriate,
                # create and associate a metrics object for the project.
                for dr in domain_results:
                    print u'Finding match for "%s"...' % (dr[u'domain'])
                    for key in domains.keys():
                        # We use endswith to handle mailto: addresses.
                        # TODO: These should be handled at the parsing stage.
                        if key.endswith(dr[u'domain']):
                            d = domains[key]
                            if dr[u'errorno'] != 0:
                                d.state = u'error'
                                d.error = u'API error (%d): %s' % (dr[u'errorno'], dr[u'description'])
                                print dr
                            else:
                                d.state = u'available' if dr[u'available'] else u'unavailable'
                                d.description = None
                            d.is_checked = True
                            d.last_checked = timezone.now()
                            d.save()
                            if d.state == u'available':
                                # Ensure a metrics link exists for the newly available domain.
                                try:
                                    um = URLMetrics.objects.get(query_url=d.domain)
                                except URLMetrics.DoesNotExist:
                                    um = URLMetrics(query_url=d.domain)
                                    um.save()
                                pm = ProjectMetrics(project=project, urlmetrics=um, is_checked=False, is_extension=False)
                                pm.save()
                            break

                # Make a debug note if a requested domain does not appear in the results (likely an error occurred)
                for domain, d in domains.items():
                    if d.state == u'unchecked':
                        print u'Domain result not found (will recheck later): %s' % domain
            else:
                print u'Warning: Unexpected response while calling API code: %d, will retry after delay' % sc

            r.close()
            time.sleep(AdminSetting.get_api_wait_time())
            lock.release()
        except Exception as e:
            lock.release()

            # A fatal error has occurred, set the project state appropriately and send an email to the user.
            project.state = u'error'
            project.error = u'Error occurred while checking domains - %s' % str(e).encode('utf-8')
            project.updated = timezone.now()
            project.completed_datetime = timezone.now()
            project.save()

            reply_address = AdminSetting.get_value(u'noreply_address')
            server_address = AdminSetting.get_value(u'server_address')
            messagebody = (u'The project "%s" has encountered an error:\n\n' +
                           u'%s\n\nYou can view the results at the following address:\n\n' +
                           u'%s/project?id=%d\n\n' +
                           u'Thank you for using Domain Checker.') % \
                (project.name(), project.error, server_address, project.id)

            user = User.objects.get(id=project.user_id)
            send_mail(u'Domain Checker - Project "%s" Error' % (project.name(),), messagebody,
                      reply_address, [user.email])

            (exc_type, exc_value, exc_traceback) = sys.exc_info()
            admin_email = AdminSetting.get_value(u'admin_address')
            admin_messagebody = (u'The user "%s" has encountered an unrecoverable error for project id %d.\n\n%s') % \
                (user.username, project.id,
                 '\n'.join(traceback.format_exception(exc_type, exc_value, exc_traceback)))

            print admin_email
            print admin_messagebody
            send_mail(u'Domain Checker - User Unrecoverable Error', admin_messagebody,
                      reply_address, [admin_email])

            # Propagate error to Celery handler
            raise

    project.update_state()
    # If any domains require metrics retrieval, start the appropriate background task
    if project.state == u'measuring':
        update_project_metrics.delay(project.id)
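# parse_namecheap_result() is not part of this excerpt. Based on the dictionary keys
# the caller expects ('domain', 'available', 'errorno', 'description' for results and
# 'number', 'description' for errors) and the general shape of Namecheap's
# namecheap.domains.check XML response (<DomainCheckResult .../> and <Error Number="...">
# elements), a hypothetical sketch might look like this; attribute names should be
# verified against real responses.

import xml.etree.ElementTree as ET

def parse_namecheap_result(rxml):
    root = ET.fromstring(rxml)
    domain_results = []
    error_results = []
    for el in root.iter():
        tag = el.tag.split('}')[-1]  # strip the XML namespace, if present
        if tag == u'DomainCheckResult':
            domain_results.append({
                u'domain': el.get('Domain'),
                u'available': el.get('Available', 'false').lower() == 'true',
                u'errorno': int(el.get('ErrorNo', 0)),
                u'description': el.get('Description', u''),
            })
        elif tag == u'Error':
            error_results.append({
                u'number': el.get('Number'),
                u'description': el.text or u'',
            })
    return (domain_results, error_results)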