Example #1
def check_failing_proxies_alert(proxy_list,
                                url='http://news.ycombinator.com',
                                receivers=['*****@*****.**']):
    if not already_run(6, PROXY_CHECKED_FILENAME):
        # Recreate the marker file that already_run() checks.
        open(PROXY_CHECKED_FILENAME, 'w').close()

        # Collect every proxy that fails to fetch the test URL.
        check_proxy_list = []
        for proxy_url in proxy_list:
            try:
                urllib.urlopen(url, proxies={'http': proxy_url})
            except IOError:
                check_proxy_list.append(proxy_url)
            else:
                time.sleep(1)

        if check_proxy_list:
            # Email the failing proxies, one per line.
            body = ''
            for proxy_url in check_proxy_list:
                body += '%s\n' % proxy_url

            notifier = EmailNotifier(config.SMTP_USER, config.SMTP_PASS,
                                     config.SMTP_FROM, config.SMTP_HOST,
                                     config.SMTP_PORT)
            notifier.send_notification(receivers,
                                       'Proxy Service - check proxy list',
                                       body)
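
A note on the helpers this example assumes: already_run() and PROXY_CHECKED_FILENAME are defined elsewhere in the project and are not shown. Judging from the call site, a minimal sketch might look like this (the path and implementation below are assumptions, not the project's actual code):

import os
import time

PROXY_CHECKED_FILENAME = '/tmp/proxy_checked'  # hypothetical path


def already_run(hours, filename):
    # Hypothetical helper: True if the marker file was modified
    # within the last `hours` hours, i.e. the check already ran.
    if not os.path.exists(filename):
        return False
    return time.time() - os.path.getmtime(filename) < hours * 3600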
Example #2
class UpdateValidator(object):
    check_category_changes_members = [69]

    def __init__(self):
        self._errors = []

        self.notifier = EmailNotifier(config.SMTP_USER, config.SMTP_PASS,
                                      config.SMTP_FROM, config.SMTP_HOST,
                                      config.SMTP_PORT)

    @property
    def errors(self):
        return self._errors

    def __send_notification_to_dev(self, spider, errors):
        receivers = []
        subject = "Found delisted duplicates for spider %s" % spider.name
        body = u"There are delisted duplicates in last crawl of spider %s:\n" % spider.name
        if errors:
            for error in errors:
                body += u'\n' + error
            try:
                body = body.encode('utf-8')
                self.notifier.send_notification(receivers, subject, body)
            except EmailNotifierException as e:
                print "Failed sending notification: %s" % e
Example #3
def sites_not_uploaded(filename='sites_not_uploaded'):
    db_session = Session()
    conn = db_session.connection()

    sites_not_uploaded_list = []
    if os.path.exists(filename):
        with open(filename) as f:
            for site in f:
                try:
                    sites_not_uploaded_list.append(int(site.strip()))
                except ValueError:
                    # Skip lines that are not integer website ids.
                    continue

    all_not_uploaded_sites = conn.execute(
        text(
            'select s.id, s.website_id, s.name, s.not_uploaded_alert_receivers '
            'from spider s join account a on(s.account_id = a.id) '
            'where s.enabled and a.enabled and s.id in (select c.spider_id from crawl c join spider s2 on '
            '(c.spider_id = s2.id) join account a2 on (s2.account_id = a2.id) where s2.enabled and a2.enabled '
            'and c.status = \'upload_finished\' group by c.spider_id having date_part(\'day\', now() - max(c.end_time)) >= 2);'
        ))

    with open(filename, 'w') as f:
        for s in all_not_uploaded_sites:
            f.write('%s\n' % s['website_id'])

            last_successful_crawl = db_session.query(Crawl)\
                    .filter(Crawl.spider_id == s['id'], Crawl.status == 'upload_finished')\
                    .order_by(desc(Crawl.crawl_date)).first()
            if last_successful_crawl and last_successful_crawl.end_time:
                duration_error_state = (datetime.now() -
                                        last_successful_crawl.end_time)
            else:
                duration_error_state = None

            if duration_error_state and duration_error_state > timedelta(days=2)\
                and s['website_id'] not in sites_not_uploaded_list:

                if s['not_uploaded_alert_receivers']:
                    receivers = [
                        r.strip()
                        for r in s['not_uploaded_alert_receivers'].split(',')
                    ]
                    body = u'%s last uploaded %s days ago\n' % (
                        s['name'], duration_error_state.days)

                    notifier = EmailNotifier(config.SMTP_USER,
                                             config.SMTP_PASS,
                                             config.SMTP_FROM,
                                             config.SMTP_HOST,
                                             config.SMTP_PORT)
                    notifier.send_notification(
                        receivers,
                        'Spider has not uploaded for 2 or more days', body)

    db_session.close()
Example #4
def send_bsm_missing_full_run_alert(receivers):
    db_session = Session()

    spiders = db_session.query(Spider)\
        .join(Account)\
        .filter(Account.enabled == True,
                Spider.enabled == True,
                Spider.parse_method == 'BSM')

    yesterday_date = (datetime.today() - timedelta(days=1)).date()

    for spider in spiders:
        last_crawl = db_session.query(Crawl)\
            .filter(Crawl.spider_id == spider.id)\
            .order_by(Crawl.id.desc(),
                      Crawl.crawl_date.desc())\
            .limit(1)\
            .first()
        if not last_crawl:
            continue
        if not last_crawl.crawl_date:
            continue
        if last_crawl.crawl_date < yesterday_date:
            continue
        if spider.crawl_method2 and spider.crawl_method2.crawl_method == 'BigSiteMethod':
            if spider.crawl_method2._params:
                bsm_params = spider.crawl_method2.params
                if 'full_crawl_cron' not in bsm_params:
                    continue
                dom, m, dow = bsm_params['full_crawl_cron'].split()
                if is_cron_today(dom, m, dow, dt=yesterday_date):
                    yesterday_crawl = db_session.query(Crawl)\
                        .filter(Crawl.spider_id == spider.id,
                                Crawl.crawl_date == yesterday_date)\
                        .limit(1)\
                        .first()
                    if not yesterday_crawl:
                        account = db_session.query(Account).get(
                            spider.account_id)
                        body = u'Missing full run for spider with BSM enabled.\n\n'
                        body += u'Account name: %s\n' % account.name
                        body += u'Spider name: %s\n' % spider.name
                        body += u'Missing full run date: %s\n' % unicode(
                            yesterday_date)
                        notifier = EmailNotifier(config.SMTP_USER,
                                                 config.SMTP_PASS,
                                                 config.SMTP_FROM,
                                                 config.SMTP_HOST,
                                                 config.SMTP_PORT)
                        notifier.send_notification(
                            receivers,
                            '[WARNING] - Missing full run for Spider', body)

    db_session.close()
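
is_cron_today() is another project helper that is not shown. The caller splits full_crawl_cron into three fields (day of month, month, day of week), so a simplified matcher might look like this (supports only '*' and comma-separated values; an assumption, not the real implementation):

def is_cron_today(dom, m, dow, dt):
    # Does the three-field cron spec match the date `dt`?
    def matches(field, value):
        if field == '*':
            return True
        return value in [int(v) for v in field.split(',')]

    # Cron counts days of week from Sunday=0; Python's isoweekday()
    # is Monday=1..Sunday=7, so map Sunday to 0 with % 7.
    return (matches(dom, dt.day) and matches(m, dt.month)
            and matches(dow, dt.isoweekday() % 7))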
Example #5
def upload_crawls(db_session):
    notifier = EmailNotifier(config.SMTP_USER, config.SMTP_PASS,
                             config.SMTP_FROM, config.SMTP_HOST,
                             config.SMTP_PORT)

    uploader = Uploader()
    crawls = db_session.query(Spider).join(Crawl).\
             filter(Spider.enabled == True, 
                    Crawl.status.in_(['processing_finished', 'upload_errors']))
    for spider in crawls.all():
        if upload_required(spider):
            if spider.crawls[-1].products_count < 1:
                print 'Not uploading crawl with 0 products'
                continue
            print 'Uploading for', spider.name
            try:
                upload_changes(uploader, spider)
                spider.crawls[-1].status = 'upload_finished'
                spider.crawls[-1].uploaded_time = datetime.now()
            except Exception:
                spider.crawls[-1].status = 'upload_errors'

            db_session.add(spider.crawls[-1])
            db_session.commit()

            try:
                _send_notification(notifier, spider.crawls[-1], spider)
            except EmailNotifierException as e:
                print "Failed to send notifications: %s" % e
Example #6
def send_report(proxies_report, spiders_report, days_back):
    notifier = EmailNotifier(config.SMTP_USER, config.SMTP_PASS,
                             config.SMTP_FROM, config.SMTP_HOST,
                             config.SMTP_PORT)

    header = 'Report on proxy usage for the last %d days' % days_back

    body = "Hello\n\nThis is an automatic report of proxy traffic usage, generated for the last %d days.\n" % days_back

    for proxy_name, traffic in proxies_report.items():
        body += "\n\n"
        body += "%s:\n\n" % proxy_name
        if proxy_name in usage_limits:
            if traffic > usage_limits[proxy_name]:
                body += "WARNING!!! Usage is too high: %0.4f GB (max allowed: %0.1f GB)\n" % \
                        ((float(traffic) / 1024 / 1024 / 1024), (float(usage_limits[proxy_name]) / 1024 / 1024 / 1024))
            else:
                body += "Usage is OK: %0.4f GB (max allowed: %0.1f GB)\n" % \
                        ((float(traffic) / 1024 / 1024 / 1024), (float(usage_limits[proxy_name]) / 1024 / 1024 / 1024))
        else:
            body += "Overall usage: %0.4f GB\n" % (float(traffic) / 1024 /
                                                   1024 / 1024)

        users = [(x, y[proxy_name]) for x, y in spiders_report.items()
                 if proxy_name in y]

        if users:
            body += "\n"
            body += "Most offensive spiders:\n"
            for spider, spider_traffic in sorted(users,
                                                 key=lambda x: x[1],
                                                 reverse=True)[:10]:
                body += "%s (%s): %0.4f GB\n" % (
                    spider.name, spider.account_name,
                    float(spider_traffic) / 1024 / 1024 / 1024)

    body += "\n\n"
    body += "Best regards"

    notifier.send_notification(emails, header, body)
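
usage_limits and emails are module-level globals that this function reads but the example does not define. The repeated /1024/1024/1024 conversions imply the limits are stored in bytes; for illustration (names and values hypothetical):

# Hypothetical configuration, in bytes to match the
# float(traffic) / 1024 / 1024 / 1024 conversions above.
usage_limits = {
    'example-proxy-provider': 50 * 1024 ** 3,  # 50 GB cap
}
emails = ['*****@*****.**']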
Example #7
def send_bsm_missing_full_run_one_month_alert(receivers):
    db_session = Session()

    spiders = db_session.query(Spider)\
        .join(Account)\
        .filter(Account.enabled == True,
                Spider.enabled == True,
                Spider.parse_method == 'BSM')

    today_date = datetime.today().date()
    # Note: this raises ValueError when today's day number does not
    # exist in the previous month (e.g. 31 March); a safer variant is
    # sketched after this example.
    month = today_date.month - 1 if today_date.month != 1 else 12
    year = today_date.year - 1 if today_date.month == 1 else today_date.year
    one_month_ago_date = datetime(day=today_date.day, month=month,
                                  year=year).date()

    for spider in spiders:
        last_full_run_date = None
        spider_crawls = db_session.query(Crawl)\
            .filter(Crawl.spider_id == spider.id)\
            .order_by(Crawl.crawl_date.desc())
        for crawl in spider_crawls:
            if crawl.stats and crawl.stats.stats_json:
                crawl_stats = json.loads(crawl.stats.stats_json)
                if crawl_stats.get('BSM') and crawl_stats.get('full_run'):
                    last_full_run_date = crawl.crawl_date
                    break
        if last_full_run_date and (last_full_run_date < one_month_ago_date):
            account = db_session.query(Account).get(spider.account_id)
            body = u'Very old full run for spider with BSM enabled.\n\n'
            body += u'Account name: %s\n' % account.name
            body += u'Spider name: %s\n' % spider.name
            body += u'Last full run date: %s\n' % unicode(last_full_run_date)
            notifier = EmailNotifier(config.SMTP_USER, config.SMTP_PASS,
                                     config.SMTP_FROM, config.SMTP_HOST,
                                     config.SMTP_PORT)
            notifier.send_notification(
                receivers, '[WARNING] - Very old full run for Spider', body)

    db_session.close()
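
As flagged in the comment above, building the date with day=today_date.day raises ValueError whenever today's day number does not exist in the previous month (e.g. 31 March). A clamping variant (a sketch, not part of the original code):

import calendar


def one_month_ago(today):
    # Clamp the day to the length of the previous month, so e.g.
    # 31 March maps to 28/29 February instead of raising ValueError.
    year = today.year - 1 if today.month == 1 else today.year
    month = 12 if today.month == 1 else today.month - 1
    day = min(today.day, calendar.monthrange(year, month)[1])
    return today.replace(year=year, month=month, day=day)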
Example #8
def send_enabled_accounts_report(receivers):
    db_session = Session()

    header = [
        'Account', 'Number of spiders', 'Number of primary spiders',
        'Number of spiders using BSM',
        'Number of products in account (In Stock and Out of Stock)',
        'Number of matches (client SKUs)', 'Match rate', 'Main Offender',
        'Common Error Type'
    ]

    accounts = db_session.query(Account)\
        .filter(Account.enabled == True)

    api_host = ''
    api_key = '3Df7mNg'

    f = open('/tmp/enabled_accounts_report_%s.csv' % int(time.time()), 'w')
    writer = csv.writer(f)
    writer.writerow(header)

    error_types = dict(ERROR_TYPES)

    for account in accounts:

        if account.upload_destinations:
            upload_dst = account.upload_destinations[0].name
        else:
            upload_dst = ''
        if upload_dst in config.new_system_api_roots:
            api_host = config.new_system_api_roots[upload_dst]
        else:
            continue

        compmon_api = Compmon2API(api_host, api_key)
        try:
            main_website_id = compmon_api.get_main_website_id(
                account.member_id)
            total_products = compmon_api.get_products_total_account(
                account.member_id)
            matched_products = compmon_api.get_matches_count_website(
                main_website_id)
            match_rate = compmon_api.get_match_rate_website(main_website_id)
        except Exception:
            # Skip accounts the API cannot report on.
            continue

        new_row = [account.name]
        spiders = db_session.query(Spider)\
            .filter(Spider.account_id == account.id,
                    Spider.enabled == True)
        account_spider_ids = [s.id for s in spiders]

        main_offender = ''
        main_error = db_session.query(SpiderError.spider_id, func.count(SpiderError.id).label('errors'))\
            .filter(SpiderError.spider_id.in_(account_spider_ids))\
            .group_by(SpiderError.spider_id).order_by(desc('errors')).first()
        if main_error:
            offender = db_session.query(Spider).get(main_error.spider_id)
            main_offender = '%s (%s)' % (offender.name, main_error.errors)

        common_error_type = ''
        main_error = db_session.query(SpiderError.error_type, func.count(SpiderError.id).label('errors'))\
            .filter(SpiderError.spider_id.in_(account_spider_ids))\
            .group_by(SpiderError.error_type).order_by(desc('errors')).first()
        if main_error:
            common_error_type = error_types[main_error.error_type]

        new_row.append(str(spiders.count()))
        new_row.append(
            str(spiders.filter(Spider.parse_method != 'Secondary').count()))
        new_row.append(
            str(spiders.filter(Spider.parse_method == 'BSM').count()))
        new_row.append(str(total_products))
        new_row.append(str(matched_products))
        new_row.append(str(match_rate))
        new_row.append(main_offender)
        new_row.append(common_error_type)

        writer.writerow(new_row)

    f.close()

    db_session.close()

    notifier = EmailNotifier(config.SMTP_USER, config.SMTP_PASS,
                             config.SMTP_FROM, config.SMTP_HOST,
                             config.SMTP_PORT)
    notifier.send_notification(receivers,
                               'Enabled Accounts Report',
                               'Please find attached the report',
                               attachments=[f.name])
Example #9
def crawler_report(receivers):
    db_session = Session()
    # Total # of real and possible errors in the past 7 days
    today = date.today()
    to_ = today
    from_ = today - timedelta(days=6)
    daily_errors = db_session.query(DailyErrors).filter(
        DailyErrors.date.between(from_, to_)).order_by(DailyErrors.date)
    total_real_errors = 0
    total_possible_errors = 0
    for daily_stat in daily_errors:
        total_real_errors += int(daily_stat.real if daily_stat.real else 0)
        total_possible_errors += int(
            daily_stat.possible if daily_stat.possible else 0)
    # Average number of possible errors we had over the past 7 days
    possible_errors_avg = int(round(float(total_possible_errors) / float(7)))
    # Current number of real errors in the system
    current_real_errors_count = db_session.query(Spider)\
        .join(SpiderError).filter(SpiderError.status == 'real').count()
    # Top 5 sites With Errors
    spider_errors = db_session.query(SpiderError)\
        .filter(SpiderError.time_added < today,
                SpiderError.time_added >= (today - timedelta(days=30)))\
        .order_by(SpiderError.time_added)
    spiders_total_errors = {}
    error_types_total = {}
    for spider_error in spider_errors:
        if spider_error.spider_id not in spiders_total_errors:
            spiders_total_errors[spider_error.spider_id] = 1
        else:
            spiders_total_errors[spider_error.spider_id] += 1
        if spider_error.error_type != 'awaiting_feedback':
            if spider_error.error_type not in error_types_total:
                error_types_total[spider_error.error_type] = 1
            else:
                error_types_total[spider_error.error_type] += 1
    top_five_spiders = sorted(spiders_total_errors.items(),
                              key=lambda item: item[1],
                              reverse=True)[:5]
    top_five_types = sorted(error_types_total.items(),
                            key=lambda item: item[1],
                            reverse=True)[:5]

    conn = db_session.connection()

    current_day = from_
    total_last_updated_sites = 0
    while current_day <= today:
        last_updated_sites = conn.execute(
            text(
                'select count(s.id) from spider s join account a on(s.account_id = a.id) '
                'where s.enabled and (s.crawl_cron is null or s.crawl_cron = \'* * * * *\') and a.enabled and s.id in (select c.spider_id from crawl c join spider s2 on '
                '(c.spider_id = s2.id) join account a2 on (s2.account_id = a2.id) where s2.enabled and a2.enabled '
                'and c.status = \'upload_finished\' and c.end_time < :current_day group by c.spider_id having '
                'date_part(\'day\', :current_day - max(c.end_time)) >= 2);'),
            current_day=current_day).fetchone()
        total_last_updated_sites += int(last_updated_sites['count'])
        current_day += timedelta(days=1)
    last_updated_sites_avg = int(
        round(float(total_last_updated_sites) / float(7)))

    body = u'Here is an overview of the crawlers status:\n\n'
    body += u'- Total # of Real Errors (past 7 days): %s' % total_real_errors
    body += u'\n- Current # of Real Errors: %s' % current_real_errors_count
    body += u'\n- Average # of Possible Errors: %s' % possible_errors_avg
    body += u'\n- Top 5 Sites With Errors:'
    for i, (sid, total) in enumerate(top_five_spiders):
        spider_name = db_session.query(Spider).get(sid).name
        body += u'\n\t%s. %s (%s)' % (i + 1, spider_name, total)
    body += u'\n- Top 5 Error Types:'
    for i, (tid, total) in enumerate(top_five_types):
        type_name = ERROR_TYPES_DICT[tid]
        body += u'\n\t%s. %s (%s)' % (i + 1, type_name, total)
    body += u'\n- Average # of Sites Not Updated in 48 Hours: %s' % last_updated_sites_avg

    notifier = EmailNotifier(config.SMTP_USER, config.SMTP_PASS,
                             config.SMTP_FROM, config.SMTP_HOST,
                             config.SMTP_PORT)
    notifier.send_notification(receivers, 'Crawlers Weekly Report', body)

    db_session.close()
Example #10
def sites_not_uploaded_account_2(receivers, account_id, subject):
    db_session = Session()

    sites_not_uploaded_list = []

    spiders = db_session.query(Spider)\
        .filter(Spider.account_id == int(account_id),
                Spider.enabled == True)

    for spider in spiders:

        last_crawl = db_session.query(Crawl)\
            .filter(Crawl.spider_id == spider.id)\
            .order_by(Crawl.crawl_date.desc(),
                      desc(Crawl.id)).limit(1).first()

        last_successful_crawl = db_session.query(Crawl)\
            .filter(Crawl.spider_id == spider.id,
                    Crawl.status == 'upload_finished')\
            .order_by(Crawl.crawl_date.desc(),
                      desc(Crawl.id)).limit(1).first()

        if last_crawl and last_crawl.status != 'upload_finished':
            # Guard against spiders that never uploaded successfully.
            if not last_successful_crawl:
                continue
            last_updated = last_successful_crawl.crawl_date
            if spider.error and spider.error.status != 'fixed':
                if spider.error.error_desc:
                    real_error = spider.error.error_desc
                else:
                    real_error = ERROR_TYPES_DICT[spider.error.error_type]
                if spider.error.assigned_to_id:
                    assigned_to = db_session.query(Developer).get(
                        spider.error.assigned_to_id)
                else:
                    assigned_to = None
            else:
                real_error = ''
                assigned_to = None

            sites_not_uploaded_list.append({
                'spider_name': spider.name,
                'last_uploaded': last_updated.strftime("%d-%m-%Y"),
                'error_type': real_error,
                'assigned_to': assigned_to.name if assigned_to else '',
                'status': last_crawl.status,
            })

    body = ''

    for site_data in sites_not_uploaded_list:
        body += (u'Spider: %(spider_name)s\n'
                 u'Status: %(status)s\n'
                 u'Last Upload: %(last_uploaded)s\n'
                 u'Errors Type: %(error_type)s\n'
                 u'Dev: %(assigned_to)s\n\n') % site_data

    if not sites_not_uploaded_list:
        body = u'All spiders have been uploaded'

    notifier = EmailNotifier(config.SMTP_USER, config.SMTP_PASS,
                             config.SMTP_FROM, config.SMTP_HOST,
                             config.SMTP_PORT)
    notifier.send_notification(receivers, subject, body)

    db_session.close()
Example #11
def send_report(res,
                spiders_str,
                period_str,
                display_traffic_threshold,
                emails,
                display_domains_below_threshold=False):
    notifier = EmailNotifier(config.SMTP_USER, config.SMTP_PASS,
                             config.SMTP_FROM, config.SMTP_HOST,
                             config.SMTP_PORT)

    header = 'Report on %s traffic for %s' % (spiders_str, period_str)

    body = "Hello\n\nThis is automatic report of %s traffic usage by spiders, generated for %s.\n" % (
        spiders_str, period_str)

    total_traffic = sum([data['traffic'] for data in res.values()])
    # float() guards against Python 2 integer division truncating.
    body += "Total traffic: %0.2f GB\n" % (
        float(total_traffic) / 1024 / 1024 / 1024)

    if res:
        domains = set([data['domain'] for data in res.values()])
        domains_traffic = {
            domain: sum([
                data['traffic'] for data in res.values()
                if data['domain'] == domain
            ])
            for domain in domains
        }
        sorted_domains = sorted(domains_traffic,
                                key=lambda x: domains_traffic[x],
                                reverse=True)
        for domain in sorted_domains:
            res_domain = {
                spider: data
                for spider, data in res.items() if data['domain'] == domain
            }
            sorted_spiders = sorted(res_domain,
                                    key=lambda x: res_domain[x]['traffic'],
                                    reverse=True)
            total_traffic = sum(
                [data['traffic'] for data in res_domain.values()])
            above_threshold = total_traffic > display_traffic_threshold
            display_spider_traffic_threshold = display_traffic_threshold / 2
            spider_above_threshold = res_domain[sorted_spiders[0]][
                'traffic'] > display_spider_traffic_threshold

            if above_threshold or display_domains_below_threshold:
                body += "\n\n"
                body += "Domain: %s\n" % domain

                body += "Total traffic: %0.2f GB\n" % (total_traffic / 1024 /
                                                       1024 / 1024, )

            if spider_above_threshold:
                for i, spider in enumerate(sorted_spiders, 1):
                    data = res_domain[spider]
                    if data['traffic'] < display_traffic_threshold:
                        break
                    body += "%d. %s: %0.2f GB\n" % (
                        i, spider, data['traffic'] / 1024 / 1024 / 1024)
    else:
        body += "No traffic"

    body += "\n\n"
    body += "Best regards"

    notifier.send_notification(emails, header, body)
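
The bytes-to-GB conversion is repeated throughout these reports, and under Python 2 a plain / between integers truncates silently. A small shared helper would remove both the duplication and the pitfall (a suggestion, not existing project code):

def to_gb(num_bytes):
    # Force float division so integer byte counters are not
    # truncated under Python 2.
    return float(num_bytes) / 1024 / 1024 / 1024

With it, lines like the total above become:

    body += "Total traffic: %0.2f GB\n" % to_gb(total_traffic)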