Example #1
0
def associate_project_metrics(project):
    """
    Checks to ensure that all the parsed domains of the given project are represented by a URLMetrics association.  A new URLMetrics object is created if an appropriate one does not exist.

    Args:
      project (UserProject): The project to check.
    """
    for pd in project.projectdomain_set.all():
        metric_associated = False
        for um in project.urlmetrics.all():
            if um.query_url == pd.domain:
                metric_associated = True
                break
        if not metric_associated:
            newum = None
            try:
                newum = URLMetrics.objects.get(query_url=pd.domain)
            except URLMetrics.DoesNotExist:
                newum = URLMetrics(query_url=query_url.pd_domain)
                newum.save()
            pm = ProjectMetrics(project=project,
                                urlmetrics=newum,
                                is_checked=False,
                                is_extension=False)
            pm.save()
Example #2
0
    def handle(self, *args, **options):
        pm_total = 0
        pm_checked = 0
        dup_removed = 0
        with transaction.atomic():
            for p in UserProject.objects.all():
                for pm in ProjectMetrics.objects.filter(project=p,
                                                        is_extension=False):
                    pm.delete()

                for pm in ProjectMetrics.objects.filter(project=p,
                                                        is_extension=True):
                    if pm.urlmetrics.is_uptodate():
                        pm_checked += 1
                        pm.is_checked = True
                    else:
                        pm.is_checked = False

                for pd in p.projectdomain_set.filter(state=u'available'):
                    ums = URLMetrics.objects.filter(
                        query_url=pd.domain).order_by(u'-last_updated')
                    if len(ums) == 0:
                        um = URLMetrics(query_url=pd.domain)
                        um.save()
                        pm = ProjectMetrics(project=p,
                                            urlmetrics=um,
                                            is_checked=False)
                        pm.save()
                    else:
                        if len(ums) > 1:
                            for dum in ums[1:]:
                                dup_removed += 1
                                dum.delete()
                        um = ums[0]
                        pm = ProjectMetrics(project=p, urlmetrics=um)
                        if um.is_uptodate():
                            pm.is_checked = True
                            pm_checked += 1
                        else:
                            pm.is_checked = False
                        pm.save()
                    pm_total += 1
        self.stdout.write('Statistics:')
        self.stdout.write('  Project metric links: %d' % pm_total)
        self.stdout.write('      Checked/up-to-date: %d' % pm_checked)
        self.stdout.write('  Duplicate metric records removed: %d' %
                          dup_removed)
        self.stdout.write('  Total metric records (post-removal): %d' %
                          (len(URLMetrics.objects.all())))
Example #3
0
def get_extensions(urlmetrics):
    """
    Returns a list of URLMetrics that represent extensions (i.e. addition of 'www.') to the given URLMetrics.
    """
    # ex_prefixes = ['www.']
    ex_prefixes = [x.prefix+'.' for x in ExtensionPrefix.objects.all()]
    extensions = []
    for ex in ex_prefixes:
        if urlmetrics.query_url.startswith(ex):
            # Skip if the given object already starts with the extension
            continue
        extension_url = ex+urlmetrics.query_url
        # Check if a URLMetrics object with the given extension already exists, otherwise create it
        try:
            exu = URLMetrics.objects.get(query_url=extension_url)
        except URLMetrics.DoesNotExist:
            exu = URLMetrics(query_url=extension_url)
        exu.extended_from = urlmetrics
        exu.save()
        extensions.append(exu)
    return extensions
Example #4
0
def get_extensions(urlmetrics):
    """
    Returns a list of URLMetrics that represent extensions (i.e. addition of 'www.') to the given URLMetrics.
    """
    # ex_prefixes = ['www.']
    ex_prefixes = [x.prefix + '.' for x in ExtensionPrefix.objects.all()]
    extensions = []
    for ex in ex_prefixes:
        if urlmetrics.query_url.startswith(ex):
            # Skip if the given object already starts with the extension
            continue
        extension_url = ex + urlmetrics.query_url
        # Check if a URLMetrics object with the given extension already exists, otherwise create it
        try:
            exu = URLMetrics.objects.get(query_url=extension_url)
        except URLMetrics.DoesNotExist:
            exu = URLMetrics(query_url=extension_url)
        exu.extended_from = urlmetrics
        exu.save()
        extensions.append(exu)
    return extensions
Example #5
0
def associate_project_metrics(project):
    """
    Checks to ensure that all the parsed domains of the given project are represented by a URLMetrics association.  A new URLMetrics object is created if an appropriate one does not exist.

    Args:
      project (UserProject): The project to check.
    """
    for pd in project.projectdomain_set.all():
        metric_associated = False
        for um in project.urlmetrics.all():
            if um.query_url == pd.domain:
                metric_associated = True
                break
        if not metric_associated:
            newum = None
            try:
                newum = URLMetrics.objects.get(query_url=pd.domain)
            except URLMetrics.DoesNotExist:
                newum = URLMetrics(query_url=query_url.pd_domain)
                newum.save()
            pm = ProjectMetrics(project=project, urlmetrics=newum, is_checked=False, is_extension=False)
            pm.save()
Example #6
0
def update_project_metrics(project_id):
    """
    Updates all the URLMetrics associated with the given project id through the Moz API.  If the MozRank of a URL is over the set threshold, extension URLs are created and also checked.

    Args:
      project_id (int): The ID of the project to update.
    """
    p = UserProject.objects.get(id=project_id)
    # Retrieve all fields available with free Moz API registration
    cols = URLMetrics.create_cols_bitflag([
        'Title',
        'Canonical URL',
        'External Links',
        'Links',
        'MozRank 10', 
        'MozRank Raw',
        'Subdomain MozRank 10',
        'Subdomain MozRank Raw',
        'HTTP Status Code',
        'Page Authority',
        'Domain Authority'])
    wait_time = AdminSetting.get_moz_api_wait_time()
    mozrank_extension_threshold = AdminSetting.get_value('mozrank_extension_threshold')
    associate_project_metrics(p)
    pmetrics = ProjectMetrics.objects.filter(project=p, is_checked=False)
    for pm in pmetrics:
        with transaction.atomic():
            if not pm.urlmetrics.is_uptodate():
                check_moz_domain(pm.urlmetrics, cols, wait_time)
            if not pm.is_extension and pm.urlmetrics.mozrank_10 >= mozrank_extension_threshold:
                extensions = get_extensions(pm.urlmetrics)
                print u'Getting extensions (%d)' % len(extensions)
                for ex in extensions:
                    print u'  %s' % ex.query_url
                    try:
                        newpm = ProjectMetrics.objects.get(project=p, urlmetrics=ex)
                    except ProjectMetrics.DoesNotExist:
                        newpm = ProjectMetrics(project=p, urlmetrics=ex, is_checked=True, is_extension=True)
                    if not ex.is_uptodate():
                        print u'  Checking extension: %s' % ex.query_url
                        check_moz_domain(ex, cols, wait_time)
                    else:
                        print u'  Extension already checked: %s' % ex.query_url
                    newpm.is_checked = True
                    newpm.save()
                
            pm.is_checked=True
            pm.save()
    p.update_state()
    p.save()
Example #7
0
def update_project_metrics(project_id):
    """
    Updates all the URLMetrics associated with the given project id through the Moz API.  If the MozRank of a URL is over the set threshold, extension URLs are created and also checked.

    Args:
      project_id (int): The ID of the project to update.
    """
    p = UserProject.objects.get(id=project_id)
    # Retrieve all fields available with free Moz API registration
    cols = URLMetrics.create_cols_bitflag([
        'Title', 'Canonical URL', 'External Links', 'Links', 'MozRank 10',
        'MozRank Raw', 'Subdomain MozRank 10', 'Subdomain MozRank Raw',
        'HTTP Status Code', 'Page Authority', 'Domain Authority'
    ])
    wait_time = AdminSetting.get_moz_api_wait_time()
    mozrank_extension_threshold = AdminSetting.get_value(
        'mozrank_extension_threshold')
    associate_project_metrics(p)
    pmetrics = ProjectMetrics.objects.filter(project=p, is_checked=False)
    for pm in pmetrics:
        with transaction.atomic():
            if not pm.urlmetrics.is_uptodate():
                check_moz_domain(pm.urlmetrics, cols, wait_time)
            if not pm.is_extension and pm.urlmetrics.mozrank_10 >= mozrank_extension_threshold:
                extensions = get_extensions(pm.urlmetrics)
                print u'Getting extensions (%d)' % len(extensions)
                for ex in extensions:
                    print u'  %s' % ex.query_url
                    try:
                        newpm = ProjectMetrics.objects.get(project=p,
                                                           urlmetrics=ex)
                    except ProjectMetrics.DoesNotExist:
                        newpm = ProjectMetrics(project=p,
                                               urlmetrics=ex,
                                               is_checked=True,
                                               is_extension=True)
                    if not ex.is_uptodate():
                        print u'  Checking extension: %s' % ex.query_url
                        check_moz_domain(ex, cols, wait_time)
                    else:
                        print u'  Extension already checked: %s' % ex.query_url
                    newpm.is_checked = True
                    newpm.save()

            pm.is_checked = True
            pm.save()
    p.update_state()
    p.save()
Example #8
0
def check_project_domains(project_id):
    """
    Use the Namecheap API to update availability status for all the domains associated with the given project.

    Args:
      project_id (int): The ID of the project to check domains for.
    """
    lock = NamecheapLock()
    project = UserProject.objects.get(id=project_id)
    # Enable debug output
    if settings.DEBUG:
        logging.basicConfig() 
        logging.getLogger().setLevel(logging.DEBUG)
        requests_log = logging.getLogger(u'requests.packages.urllib3')
        requests_log.setLevel(logging.DEBUG)
        requests_log.propagate = True
    while True:
        lock.acquire()
        try:
            # Retrieve list of unchecked domains (limited by the set limit of domains per call)
            domain_list = project.projectdomain_set.filter(is_checked=False)[:AdminSetting.get_api_urls_per_request()]
            # If no domains unchecked, progress project to the next stage (usually metrics measuring)
            if domain_list.count() == 0:
                print u'No domains found.'
                project.update_state(save=False)
                project.save()

                lock.release()
                break

            # Fold the list into a dictionary for easy reference
            domains = dict([(d.domain, d) for d in domain_list])
            domain_str = u','.join(domains.keys())

            params = AdminSetting.get_api_params()
            params.append((u'Command', u'namecheap.domains.check'))
            params.append((u'DomainList', domain_str))

            print u'Domains that will be checked: %s' % domain_str
            print params

            # Make the call to the Namecheap API (retry 3 times then fail)
            retries = 0
            while True:
                try:
                    r = requests.get(AdminSetting.get_api_url(), params=params)
                    break
                except requests.exceptions.ConnectionError as ce:
                    retries += 1
                    if retries >= 3:
                        raise ce
                    time.sleep(5)

            sc = r.status_code
            print u'Status code: %d' % sc

            if sc == 200:
                rxml = r.text.encode(u'utf-8')
                (domain_results, error_results) = parse_namecheap_result(rxml)
                if len(domain_results) == 0 and len(error_results) > 0:
                    # Handle specific but rare Namecheap API errors gracefully
                    for er in error_results:
                        if int(er[u'number']) == 2030280:
                            # TLD not found - assume same result for all
                            for domain, d in domains.items():
                                d.state = u'error'
                                d.error = u'API unable to parse TLD for this domain (possible encoding issue)'
                                d.is_checked = True
                                d.last_checked = timezone.now()
                                d.save()
                            break
                        elif int(er[u'number']) == 3031510:
                            # Denied authorization for this domain
                            for domain, d in domains.items():
                                d.state = u'error'
                                d.error = u'API denies authorisation to check this domain (reason not given)'
                                d.is_checked = True
                                d.last_checked = timezone.now()
                                d.save()
                            break
                        else:
                            # Assume catastrophic error
                            error_str = u'the API backend returned the following unrecoverable error(s):\n\n'
                            error_str += u'\n'.join([u'  %d: [%s] %s' % (i+1, er[u'number'], er[u'description']) for i, er in enumerate(error_results)])
                            raise Exception(error_str)

                """
                Match the call results to the domain list and store them.  If appropriate, create and associate a metrics object for the project.
                """
                for dr in domain_results:
                    print u'Finding match for "%s"...' % (dr[u'domain'])
                    for key in domains.keys():
                        # We use endswith to handle mailto: addresses, TODO: These should be handled at the parsing stage
                        if key.endswith(dr[u'domain']):
                            d = domains[key]
                            if dr[u'errorno'] != 0:
                                d.state = u'error'
                                d.error = u'API error (%d): %s' % (dr[u'errorno'], dr[u'description'])
                                print dr
                            else:
                                d.state = u'available' if dr[u'available'] else u'unavailable'
                                d.description = None
                            d.is_checked = True
                            d.last_checked = timezone.now()
                            d.save()
                            if d.state == u'available':
                                try:
                                    um = URLMetrics.objects.get(query_url=d.domain)
                                except URLMetrics.DoesNotExist:
                                    um = URLMetrics(query_url=d.domain)
                                    um.save()
                                pm = ProjectMetrics(project=project, urlmetrics=um, is_checked=False, is_extension=False)
                                pm.save()
                            break

                # Make a debug note if a requested domain does not appear in the results (likely an error occurred)
                for domain, d in domains.items():
                    if d.state == u'unchecked':
                        print u'Domain result not found (will recheck later): %s' % domain
            else:
                print u'Warning: Unexpected response while calling API code: %d, will retry after delay' % sc

            r.close()
            time.sleep(AdminSetting.get_api_wait_time())
            lock.release()
        except Exception as e:
            lock.release()

            # A fatal error has occurred, set the project state appropriately and send an email to the user.
            project.state = u'error'
            project.error = u'Error occurred while checking domains - %s' % str(e).encode('utf-8')
            project.updated = timezone.now()
            project.completed_datetime = timezone.now()
            project.save()
            reply_address = AdminSetting.get_value(u'noreply_address')
            server_address = AdminSetting.get_value(u'server_address')
            messagebody = (u'The project "%s" has encountered an error:\n\n' + \
                  u'%s\n\nYou can view the results at the following address:\n\n' + \
                  u'%s/project?id=%d\n\n' + \
                  u'Thank you for using Domain Checker.') % \
                  (project.name(), project.error, server_address, project.id)
            user = User.objects.get(id=project.user_id)
            send_mail(u'Domain Checker - Project "%s" Error' % (project.name(),), messagebody, reply_address, [user.email])

            (exc_type, exc_value, exc_traceback) = sys.exc_info()
            admin_email = AdminSetting.get_value(u'admin_address')
            admin_messagebody = (u'The user "%s" has encountered an unrecoverable error for project id %d.\n\n%s') % \
                (user.username, project.id, '\n'.join(traceback.format_exception(exc_type, exc_value, exc_traceback)))
            print admin_email
            print admin_messagebody
                
            send_mail(u'Domain Checker - User Unrecoverable Error', admin_messagebody, reply_address, [admin_email])

            # Propagate error to Celery handler
            raise

        project.update_state()
        # If any domains require metrics retrieval, start the appropriate background task
        if project.state == u'measuring':
            update_project_metrics.delay(project.id)
Example #9
0
def check_project_domains(project_id):
    """
    Use the Namecheap API to update availability status for all the domains associated with the given project.

    Args:
      project_id (int): The ID of the project to check domains for.
    """
    lock = NamecheapLock()
    project = UserProject.objects.get(id=project_id)
    # Enable debug output
    if settings.DEBUG:
        logging.basicConfig()
        logging.getLogger().setLevel(logging.DEBUG)
        requests_log = logging.getLogger(u'requests.packages.urllib3')
        requests_log.setLevel(logging.DEBUG)
        requests_log.propagate = True
    while True:
        lock.acquire()
        try:
            # Retrieve list of unchecked domains (limited by the set limit of domains per call)
            domain_list = project.projectdomain_set.filter(
                is_checked=False)[:AdminSetting.get_api_urls_per_request()]
            # If no domains unchecked, progress project to the next stage (usually metrics measuring)
            if domain_list.count() == 0:
                print u'No domains found.'
                project.update_state(save=False)
                project.save()

                lock.release()
                break

            # Fold the list into a dictionary for easy reference
            domains = dict([(d.domain, d) for d in domain_list])
            domain_str = u','.join(domains.keys())

            params = AdminSetting.get_api_params()
            params.append((u'Command', u'namecheap.domains.check'))
            params.append((u'DomainList', domain_str))

            print u'Domains that will be checked: %s' % domain_str
            print params

            # Make the call to the Namecheap API (retry 3 times then fail)
            retries = 0
            while True:
                try:
                    r = requests.get(AdminSetting.get_api_url(), params=params)
                    break
                except requests.exceptions.ConnectionError as ce:
                    retries += 1
                    if retries >= 3:
                        raise ce
                    time.sleep(5)

            sc = r.status_code
            print u'Status code: %d' % sc

            if sc == 200:
                rxml = r.text.encode(u'utf-8')
                (domain_results, error_results) = parse_namecheap_result(rxml)
                if len(domain_results) == 0 and len(error_results) > 0:
                    # Handle specific but rare Namecheap API errors gracefully
                    for er in error_results:
                        if int(er[u'number']) == 2030280:
                            # TLD not found - assume same result for all
                            for domain, d in domains.items():
                                d.state = u'error'
                                d.error = u'API unable to parse TLD for this domain (possible encoding issue)'
                                d.is_checked = True
                                d.last_checked = timezone.now()
                                d.save()
                            break
                        elif int(er[u'number']) == 3031510:
                            # Denied authorization for this domain
                            for domain, d in domains.items():
                                d.state = u'error'
                                d.error = u'API denies authorisation to check this domain (reason not given)'
                                d.is_checked = True
                                d.last_checked = timezone.now()
                                d.save()
                            break
                        else:
                            # Assume catastrophic error
                            error_str = u'the API backend returned the following unrecoverable error(s):\n\n'
                            error_str += u'\n'.join([
                                u'  %d: [%s] %s' %
                                (i + 1, er[u'number'], er[u'description'])
                                for i, er in enumerate(error_results)
                            ])
                            raise Exception(error_str)
                """
                Match the call results to the domain list and store them.  If appropriate, create and associate a metrics object for the project.
                """
                for dr in domain_results:
                    print u'Finding match for "%s"...' % (dr[u'domain'])
                    for key in domains.keys():
                        # We use endswith to handle mailto: addresses, TODO: These should be handled at the parsing stage
                        if key.endswith(dr[u'domain']):
                            d = domains[key]
                            if dr[u'errorno'] != 0:
                                d.state = u'error'
                                d.error = u'API error (%d): %s' % (
                                    dr[u'errorno'], dr[u'description'])
                                print dr
                            else:
                                d.state = u'available' if dr[
                                    u'available'] else u'unavailable'
                                d.description = None
                            d.is_checked = True
                            d.last_checked = timezone.now()
                            d.save()
                            if d.state == u'available':
                                try:
                                    um = URLMetrics.objects.get(
                                        query_url=d.domain)
                                except URLMetrics.DoesNotExist:
                                    um = URLMetrics(query_url=d.domain)
                                    um.save()
                                pm = ProjectMetrics(project=project,
                                                    urlmetrics=um,
                                                    is_checked=False,
                                                    is_extension=False)
                                pm.save()
                            break

                # Make a debug note if a requested domain does not appear in the results (likely an error occurred)
                for domain, d in domains.items():
                    if d.state == u'unchecked':
                        print u'Domain result not found (will recheck later): %s' % domain
            else:
                print u'Warning: Unexpected response while calling API code: %d, will retry after delay' % sc

            r.close()
            time.sleep(AdminSetting.get_api_wait_time())
            lock.release()
        except Exception as e:
            lock.release()

            # A fatal error has occurred, set the project state appropriately and send an email to the user.
            project.state = u'error'
            project.error = u'Error occurred while checking domains - %s' % str(
                e).encode('utf-8')
            project.updated = timezone.now()
            project.completed_datetime = timezone.now()
            project.save()
            reply_address = AdminSetting.get_value(u'noreply_address')
            server_address = AdminSetting.get_value(u'server_address')
            messagebody = (u'The project "%s" has encountered an error:\n\n' + \
                  u'%s\n\nYou can view the results at the following address:\n\n' + \
                  u'%s/project?id=%d\n\n' + \
                  u'Thank you for using Domain Checker.') % \
                  (project.name(), project.error, server_address, project.id)
            user = User.objects.get(id=project.user_id)
            send_mail(
                u'Domain Checker - Project "%s" Error' % (project.name(), ),
                messagebody, reply_address, [user.email])

            (exc_type, exc_value, exc_traceback) = sys.exc_info()
            admin_email = AdminSetting.get_value(u'admin_address')
            admin_messagebody = (u'The user "%s" has encountered an unrecoverable error for project id %d.\n\n%s') % \
                (user.username, project.id, '\n'.join(traceback.format_exception(exc_type, exc_value, exc_traceback)))
            print admin_email
            print admin_messagebody

            send_mail(u'Domain Checker - User Unrecoverable Error',
                      admin_messagebody, reply_address, [admin_email])

            # Propagate error to Celery handler
            raise

        project.update_state()
        # If any domains require metrics retrieval, start the appropriate background task
        if project.state == u'measuring':
            update_project_metrics.delay(project.id)