Example #1
class GuguPipeline(object):

  def __init__(self, mail_to):
    self.mailer = MailSender()
    self.mail_to = mail_to
    if mail_to:
      log.msg('Emails will be sent to %s' % mail_to, level=logging.INFO)

  @classmethod
  def from_settings(cls, settings):
    mail_to = settings['GUGU_PIPELINE_MAIL_TO']
    return cls(mail_to)

  def process_item(self, item, spider):
    if re.search(GUGU_PATTERN, item['lyrics']):
      item['match'] = 'true'
      self.send_email(item)
    else:
      item['match'] = 'false'
    return item

  def send_email(self, item):
    if not self.mail_to:
      return
    subject = "Found a match: {artist} - {title}".format(**item)
    body = """URL: {url}

{lyrics}
""".format(**item)
    self.mailer.send(to=[self.mail_to], subject=subject, body=body)
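The pipeline above reads its recipient from the crawler settings via from_settings, so it can be enabled and configured entirely from settings.py. A minimal sketch, assuming a hypothetical project module path and address (neither is part of the example):

# settings.py (hypothetical module path and address)
ITEM_PIPELINES = {
    "myproject.pipelines.GuguPipeline": 300,
}

# Read by GuguPipeline.from_settings(); an empty value disables the e-mails.
GUGU_PIPELINE_MAIL_TO = "alerts@example.com"

Because the pipeline instantiates MailSender() with no arguments, messages go out through the constructor defaults (local SMTP on port 25) unless explicit smtphost/mailfrom values are added.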
Example #2
    def test_send_attach(self):
        attach = BytesIO()
        attach.write(b'content')
        attach.seek(0)
        attachs = [('attachment', 'text/plain', attach)]

        mailsender = MailSender(debug=True)
        mailsender.send(to=['*****@*****.**'], subject='subject', body='body',
                       attachs=attachs, _callback=self._catch_mail_sent)

        assert self.catched_msg
        self.assertEqual(self.catched_msg['to'], ['*****@*****.**'])
        self.assertEqual(self.catched_msg['subject'], 'subject')
        self.assertEqual(self.catched_msg['body'], 'body')

        msg = self.catched_msg['msg']
        self.assertEqual(msg['to'], '*****@*****.**')
        self.assertEqual(msg['subject'], 'subject')

        payload = msg.get_payload()
        assert isinstance(payload, list)
        self.assertEqual(len(payload), 2)

        text, attach = payload
        self.assertEqual(text.get_payload(decode=True), b'body')
        self.assertEqual(text.get_charset(), Charset('us-ascii'))
        self.assertEqual(attach.get_payload(decode=True), b'content')
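Several of the test snippets in this collection pass debug=True together with _callback=self._catch_mail_sent but never show that helper. A plausible reconstruction, inferred only from how self.catched_msg is used in the assertions (not copied from any example):

    def _catch_mail_sent(self, **kwargs):
        # With debug=True MailSender logs the message instead of sending it;
        # the _callback still fires and receives the recipients, subject, body
        # and the built MIME message (msg), which the tests stash here.
        self.catched_msg = dict(**kwargs)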
Example #3
 def stats_spider_closed(self, spider, spider_stats):
     mail = MailSender()
     body = "Global stats\n\n"
     body += "\n".join("%-50s : %s" % i for i in stats.get_stats().items())
     body += "\n\n%s stats\n\n" % spider.name
     body += "\n".join("%-50s : %s" % i for i in spider_stats.items())
     mail.send(self.recipients, "Scrapy stats for: %s" % spider.name, body)
Example #4
    def test_send_attach_utf8(self):
        subject = u'sübjèçt'
        body = u'bödÿ-àéïöñß'
        attach = BytesIO()
        attach.write(body.encode('utf-8'))
        attach.seek(0)
        attachs = [('attachment', 'text/plain', attach)]

        mailsender = MailSender(debug=True)
        mailsender.send(to=['*****@*****.**'], subject=subject, body=body,
                        attachs=attachs, charset='utf-8', _callback=self._catch_mail_sent)

        assert self.catched_msg
        self.assertEqual(self.catched_msg['subject'], subject)
        self.assertEqual(self.catched_msg['body'], body)

        msg = self.catched_msg['msg']
        self.assertEqual(msg['subject'], subject)
        self.assertEqual(msg.get_charset(), Charset('utf-8'))
        self.assertEqual(msg.get('Content-Type'), 'multipart/mixed; charset="utf-8"')

        payload = msg.get_payload()
        assert isinstance(payload, list)
        self.assertEqual(len(payload), 2)

        text, attach = payload
        self.assertEqual(text.get_payload(decode=True).decode('utf-8'), body)
        self.assertEqual(text.get_charset(), Charset('utf-8'))
        self.assertEqual(attach.get_payload(decode=True).decode('utf-8'), body)
Example #5
    def test_send_html(self):
        mailsender = MailSender(debug=True)
        mailsender.send(to=['*****@*****.**'], subject='subject', body='<p>body</p>', mimetype='text/html', _callback=self._catch_mail_sent)

        msg = self.catched_msg['msg']
        self.assertEqual(msg.get_payload(), '<p>body</p>')
        self.assertEqual(msg.get('Content-Type'), 'text/html')
Example #6
    def test_send_attach(self):
        attach = StringIO()
        attach.write('content')
        attach.seek(0)
        attachs = [('attachment', 'text/plain', attach)]

        mailsender = MailSender(debug=True, crawler=self.crawler)
        mailsender.send(to=['*****@*****.**'], subject='subject', body='body',
                       attachs=attachs)

        assert self.catched_msg
        self.assertEqual(self.catched_msg['to'], ['*****@*****.**'])
        self.assertEqual(self.catched_msg['subject'], 'subject')
        self.assertEqual(self.catched_msg['body'], 'body')

        msg = self.catched_msg['msg']
        self.assertEqual(msg['to'], '*****@*****.**')
        self.assertEqual(msg['subject'], 'subject')

        payload = msg.get_payload()
        assert isinstance(payload, list)
        self.assertEqual(len(payload), 2)

        text, attach = payload
        self.assertEqual(text.get_payload(decode=True), 'body')
        self.assertEqual(attach.get_payload(decode=True), 'content')
Example #7
class SpiderOpenCloseLogging(object):

    def __init__(self):
        dispatcher.connect(self.spider_opened, signal=signals.spider_opened)
        dispatcher.connect(self.spider_closed, signal=signals.spider_closed)
        self.mailer = MailSender()
        self.mailer.smtphost = "smtp.sina.cn"
        self.mailer.smtpuser = "******"
        self.mailer.smtppass = "******"
        self.mailer.mailfrom = "*****@*****.**"

    def spider_opened(self, spider):
        log.msg("opened spider %s" % spider.name)
        self.mailer.send(to=["*****@*****.**"], subject="scrapy running", body="scrapy is start")

    def spider_closed(self, spider):
        if spider.domain:
            param =(spider.sales_num,spider.money,spider.queue_id)
            spider.cur.execute("update admin_queue set sales=%s , money=%s where id=%s",param)
            spider.conn.commit()
            spider.cur.close()
            spider.conn.close()
            mail_content = str(spider.shop_name)+"\n"
            mail_content += "30天销量:"+str(spider.sales_num)+" \n30天成交额:"+str(spider.money)+"\n店铺地址:"+str(spider.domain)+"\n"
            mail_content+="---------------------------------------\n"
            mail_content+=spider.shopinfo_str
            mail_title = str(spider.shop_name) +' 数据报告'
            self.mailer.send(to=[str(spider.mailto)], subject=mail_title, body=mail_content)
        log.msg("closed spider %s" % spider.name)
Example #8
    def close(spider, reason):
        # send email when spider closed
        if spider.email_content.strip():
            mailer = MailSender(mailfrom="*****@*****.**", smtphost="smtp.gmail.com", smtpport=587, smtpuser="******",smtppass="******")
            mailer.send(to=["*****@*****.**", "*****@*****.**"], cc=["*****@*****.**"], subject= "[Movies Here] " + spider.keyword + " is coming!!!", body=spider.email_content)

        closed = getattr(spider, 'closed', None)
        if callable(closed):
            return closed(reason)
Example #9
 def spider_closed(self, spider):
     mailer = MailSender(mailfrom="*****@*****.**",
                         smtphost="smtp.gmail.com",
                         smtpport=587,
                         smtpuser="******",
                         smtppass="******")
     mailer.send(to=["*****@*****.**"],
                 subject="Test mail : Report",
                 body="Run completed for Thrillophilia Crawler ",
                 cc=["*****@*****.**", "*****@*****.**"])
Example #10
def get_email(source_name):
    mailer = MailSender(mailfrom="*****@*****.**",
                        smtphost="smtp.gmail.com",
                        smtpport=587,
                        smtpuser="******",
                        smtppass="******")
    mailer.send(to=["*****@*****.**"],
                subject="Test mail : Report",
                body="Run completed for %s " % source_name,
                cc=["*****@*****.**", "*****@*****.**"])
Example #11
def send_mail(subject, body):
    mailer = MailSender(smtphost=settings.MAIL_HOST,
                        mailfrom=settings.MAIL_FROM,
                        smtpuser=settings.MAIL_USER,
                        smtppass=settings.MAIL_PASS,
                        smtpport=25)
    to = settings.MAIL_TO
    mailer.send(to=to,
                subject=subject.encode('utf-8'),
                body=body.encode('utf-8'))
Example #12
 def spider_error(failure):
     """Send errors email."""
     from_email = RYANAIR_SETTINGS['FROM_EMAIL']
     to_email = RYANAIR_SETTINGS['FAILURE_EMAIL']
     mailer = MailSender(mailfrom=from_email)
     mailer.send(
         to=[to_email],
         subject="Ryanair flights error",
         body=failure.getErrorMessage(),
     )
Example #14
    def login(self, response):
        session = requests.session()
        session.cookies = cookielib.LWPCookieJar(filename="cookies.txt")
        try:
            session.cookies.load(ignore_discard=True)
        except:
            print("cookie未能加载")
        account = 'deya201'
        password = '******'
        captcha = response.css("#imgcode::attr(src)").extract_first()
        captcha_url = parse.urljoin(response.url, captcha)
        post_url = 'http://www.haiguan.info/ajaxpro/SCEC.HaiguanInfo.Login,SCEC.HaiguanInfo.ashx'
        post_data = '{"loginStr":"' + account + '/' + password + '"}'
        self.headers['X-AjaxPro-Method'] = 'Encode'
        response = session.post(post_url, data=post_data, headers=self.headers)
        session.cookies.save()
        token = str(response.content, encoding='utf-8').split('"')[1]
        # fetch the captcha image
        t = session.get(captcha_url)
        with open('captcha.jpg', 'wb') as f:
            f.write(t.content)
            f.close()
        from PIL import Image
        try:
            im = Image.open('captcha.jpg')
            im.show()
            im.close()
        except:
            pass

        code = input("输入验证码\n>")

        post_data = '{"loginStr":"' + token + '","code":"' + code + '"}'
        self.headers['X-AjaxPro-Method'] = 'CheckLogin'
        # request CheckLogin
        response = session.post(post_url, data=post_data, headers=self.headers)
        session.cookies.save()

        # have Scrapy send the notification mail automatically
        from scrapy.mail import MailSender
        # mailer = MailSender.from_settings(settings)  # raised an error here; cause not found
        mailer = MailSender(
            smtphost="smtp.163.com",  # SMTP server used to send the mail
            mailfrom="*****@*****.**",  # sender address
            smtpuser="******",  # username
            smtppass="******",  # the SMTP authorization code, not the account's login password
            smtpport=25  # port number
        )
        body = """ 
               发送的邮件内容 :
               海关信息网登陆成功啦,赶紧打开你的pycharm查看查看查看!!!!
               """
        subject = '爬虫邮件测试测试'
        # if the body is too short or plain, the message may be rejected as spam
        mailer.send(to=["*****@*****.**"], subject=subject, body=body)
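Both this example and Example #43 comment out MailSender.from_settings(settings) with a note that it raised an error they never diagnosed. For reference, a hedged sketch of the settings-based route; the spider name, addresses and MAIL_* values below are placeholders, not taken from the examples:

import scrapy
from scrapy.mail import MailSender

# settings.py (standard Scrapy mail settings read by from_settings()):
# MAIL_HOST = "smtp.163.com"
# MAIL_PORT = 25
# MAIL_FROM = "bot@example.com"
# MAIL_USER = "bot@example.com"
# MAIL_PASS = "authorization-code"

class NotifyingSpider(scrapy.Spider):
    name = "notifying"

    def closed(self, reason):
        # self.settings is a scrapy.settings.Settings object, which is what
        # from_settings() expects; passing a plain dict or a settings module
        # instead is one possible cause of the errors mentioned above.
        mailer = MailSender.from_settings(self.settings)
        return mailer.send(to=["admin@example.com"],
                           subject="Crawl finished: %s" % reason,
                           body="Spider %s closed (%s)" % (self.name, reason))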
Example #15
    def test_send_html(self):
        mailsender = MailSender(debug=True)
        mailsender.send(to=['*****@*****.**'],
                        subject='subject',
                        body='<p>body</p>',
                        mimetype='text/html',
                        _callback=self._catch_mail_sent)

        msg = self.catched_msg['msg']
        self.assertEqual(msg.get_payload(), '<p>body</p>')
        self.assertEqual(msg.get('Content-Type'), 'text/html')
Example #16
    def close_spider(self, spider):
        info = self.info.pop(spider.name)
        if info is not None:
            outdir = spider.outdir
            outpath = os.path.join(outdir, "links.json")
            items = info['items']
            with open(outpath, 'w') as f:
                f.write(json.dumps([dict(i) for i in items]))

            errors = [
                i for i in items if i['status'] != 200 or
                i['validation_error'] or i['header_errors']]
            if errors:
                with open(os.path.join(outdir, 'ERRORS'), 'w') as f:
                    f.write(json.dumps([dict(i) for i in errors]))

                message = []
                for i in errors:
                    item_message = ["===\nURL: {0}\n\n".format(i['url'])]

                    status = i['status']
                    if status != 200:
                        item_message.append(
                            "Failed retrieval with status: {0}\n".format(
                                status))

                    if i['validation_error']:
                        item_message.append("Failed validation.\n\n")


                    header_errors = i['header_errors']
                    if header_errors:
                        item_message.append(
                            ("Failed header checks with the following "
                             "errors:\n{0}\n").format(
                                 "\n".join(header_errors)))

                    if len(item_message) > 1:
                        message += item_message

                message.append("\nSee %s for details of validation errors." %
                               outdir)

                email_body = "".join(message)
                with open(os.path.join(outdir, 'REPORT'), 'w') as f:
                    f.write(email_body)

                send_to = spider.send_to
                if send_to is not None:
                    sender = MailSender(mailfrom="*****@*****.**")
                    sender.send([send_to], "Smoketest failure", email_body)
            else:
                with open(os.path.join(outdir, 'CLEAN'), 'w') as f:
                    f.write("yes\n")
Example #17
 def close(self, reason):
     self.logger.info(reason)
     mailfrom = '*****@*****.**'
     smtphost = 'smtp.163.com'
     smtpport = 25
     smtpuser = '******'
     smtppass = '******'
     smtpssl = True
     mailer = MailSender(mailfrom=mailfrom, smtphost=smtphost, smtpuser=smtpuser, smtppass=smtppass)
     # mailer = MailSender.from_settings(settings.MAIL)
     mailer.send(to=['*****@*****.**'], subject='Send Email Test by Scrapy MailSender!', body='Holle world!')
     print settings.MAIL['MAIL_USER']
Example #18
class MailSender():
    def __init__(self, email):
        self.mail_from = '*****@*****.**'
        self.mail_user = '******'
        self.mail_pass = '******'
        self.mail_host = 'smtp.sina.com'
        self.email = email
        self.mail_sender = ScrapyMailSender(smtphost=self.mail_host, mailfrom=self.mail_from, smtpuser=self.mail_user, smtppass=self.mail_pass)
    def send(self):
        print self.email.email_to
        print self.mail_sender.send(to = self.email.email_to, subject = self.email.subject, body = self.email.content)
        print '已发送!'
Example #19
 def wrapper(*args, **kwargs):
     max_attempts = settings.getint("MAX_MONGO_RECONNECT_ATTEMPTS", MAX_AUTO_RECONNECT_ATTEMPTS)
     mail = MailSender()
     for attempt in xrange(max_attempts):
         try:
             return mongo_op_func(*args, **kwargs)
         except AutoReconnect as e:
             wait_t = 1 + attempt # exponential back off
             log.msg("PyMongo auto-reconnecting... %s. Waiting %.1f seconds."%(str(e), wait_t), log.INFO)
             mail.send(to=[settings.get('MAIL_TO')], subject='PyMongo auto-reconnecting....', \
                   body="%s\n%s"%(e, traceback.format_exc()))
             time.sleep(wait_t)
Example #20
 def spider_opened(self, spider):
     mailer = MailSender(mailfrom="*****@*****.**")
     settings = get_project_settings()
     hostname = socket.gethostname()
     body = '''-Crawl name: {0}\n-Cache directory: {1}\n-Hostname : {2} \n Crawler_name: Amani BOUYAHIA'''.format(
         settings.get('BOT_NAME'),
         settings.get('HTTPCACHE_DIR'),
         hostname,
     )
     mailer.send(to=["*****@*****.**"],
                 subject="The crawl of %s is %s " %
                 (spider.name, "launched"),
                 body=body)
Example #21
    def send_email(self, to=[], cc=[], subject="爬虫运行异常", body="", attachs=[]):
        # if no recipients were given, fall back to the root (superuser) account's email
        if len(to) == 0:
            root_user = User.objects.filter(is_superuser=1)
            if len(root_user) == 0:
                raise Exception("root账户不存在, 请添加root账户和root账户的邮箱")
            root_user_email = root_user[0].email
            if root_user_email == None or root_user_email == "":
                raise Exception("root账户没有配置邮箱, 请添加root账户的邮箱")
            self.email_receiver.append(root_user_email)

        mailer = MailSender()
        mailer.send(to=to, cc=cc, subject=subject.encode("utf-8"), body=body.encode("utf-8"), attachs=attachs)
Example #22
    def test_send_html(self):
        mailsender = MailSender(debug=True)
        mailsender.send(
            to=["*****@*****.**"],
            subject="subject",
            body="<p>body</p>",
            mimetype="text/html",
            _callback=self._catch_mail_sent,
        )

        msg = self.catched_msg["msg"]
        self.assertEqual(msg.get_payload(), "<p>body</p>")
        self.assertEqual(msg.get("Content-Type"), "text/html")
Example #23
class MemoryDebugger(object):

    def __init__(self):
        try:
            import libxml2
            self.libxml2 = libxml2
        except ImportError:
            self.libxml2 = None
        if not settings.getbool('MEMDEBUG_ENABLED'):
            raise NotConfigured

        self.mail = MailSender()
        self.rcpts = settings.getlist('MEMDEBUG_NOTIFY')

        dispatcher.connect(self.engine_started, signals.engine_started)
        dispatcher.connect(self.engine_stopped, signals.engine_stopped)

    def engine_started(self):
        if self.libxml2:
            self.libxml2.debugMemory(1)

    def engine_stopped(self):
        figures = self.collect_figures()
        report = self.create_report(figures)
        self.log_or_send_report(report)

    def collect_figures(self):
        gc.collect()

        figures = []
        figures.append(("Objects in gc.garbage", len(gc.garbage), ""))
        if self.libxml2:
            self.libxml2.cleanupParser()
            figures.append(("libxml2 memory leak", self.libxml2.debugMemory(1), "bytes"))
        return figures

    def create_report(self, figures):
        s = ""
        s += "SCRAPY MEMORY DEBUGGER RESULTS\n\n"
        for f in figures:
            s += "%-30s : %d %s\n" % f
        if settings.getbool('TRACK_REFS'):
            s += os.linesep
            s += format_live_refs()
        return s

    def log_or_send_report(self, report):
        if self.rcpts:
            self.mail.send(self.rcpts, "Scrapy Memory Debugger results at %s" % \
                socket.gethostname(), report)
        log.msg(report)
Example #24
 def closed(self, reason):  # send an email when the crawl finishes
     mailer = MailSender(
         smtphost="smtp.163.com",  # SMTP server used to send the mail
         mailfrom="*****@*****.**",  # sender address
         smtpuser="******",  # username
         smtppass="******",  # the SMTP authorization code, not the account's login password
         smtpport=25  # port number
     )
     body = u"测试发送---"
     subject = u'测试发送---标题'
     # if the body is too short or plain, the message may be rejected as spam
     mailer.send(to=["*****@*****.**", "*****@*****.**"],
                 subject=subject,
                 body=body)
Example #25
 def close_spider(self, spider):
     self.exporter.finish_exporting()
     # using gmail to send mail
     mailer = MailSender(smtphost="smtp.gmail.com",
                         mailfrom='',
                         smtpuser="",
                         smtppass="",
                         smtpport=587)
     myFile = open("jobs.csv", "r")
     self.file.close()
     mailer.send(to=["*****@*****.**"],
                 subject="Scrapy mail",
                 body="Did you receive this, oh!",
                 attachs=(("twors", "text/plain", myFile), ))
Example #26
	def engine_closed(self):
		dirtree = os.path.join(settings.get('DATA_PATH'), 'Top')
		#get the total size of the corpus (could take a while...)
		mailer = MailSender()
		success = os.path.isdir(dirtree)
		if success:
			content = "Crawling ended successfully at %s." % time.asctime()
		else:
			content = "Crawling ended abnormally at %s" % time.asctime()

		mailer.send(to = ['*****@*****.**', '*****@*****.**', '*****@*****.**'],
			   subject = "The training corpus has been downloaded" if success else "Error crawling",
			   body =content ,
			   )
Example #27
 def parse(self, response):
     try:
         db_id_url = response.meta['db_id_url']
         select = Selector(response)
         find_str = select.xpath('//*[@id="header_top_bar"]/span/text()').extract()[0]
         result = int(''.join(re.findall('\d+', find_str)))
         cur = self.conn.cursor()
         cur.execute(self.update_sql%(result,db_id_url))
         self.conn.commit()
         log.msg("success to parse : %s and result is %s"%(response.url,find_str), level=log.INFO)
     except KeyError,e:
         log.msg("fail to prase url : %s"%response.url,level=log.ERROR)
         mailer = MailSender(smtphost="smtp.163.com",mailfrom="*****@*****.**",smtpuser="******",smtppass="******",smtpport=25)
         mailer.send(to=["*****@*****.**"], subject="Some subject", body=response.url)
Example #28
    def test_send(self):
        mailsender = MailSender(debug=True)
        mailsender.send(to=['*****@*****.**'], subject='subject', body='body')

        assert self.catched_msg

        self.assertEqual(self.catched_msg['to'], ['*****@*****.**'])
        self.assertEqual(self.catched_msg['subject'], 'subject')
        self.assertEqual(self.catched_msg['body'], 'body')

        msg = self.catched_msg['msg']
        self.assertEqual(msg['to'], '*****@*****.**')
        self.assertEqual(msg['subject'], 'subject')
        self.assertEqual(msg.get_payload(), 'body')
Example #29
    def spider_closed(self, spider, reason):

        mailer = MailSender()
        pige = 1333
        intro = "Summary stats from Scrapy spider: \n\n"
        stats = spider.crawler.stats.get_stats()
        comptage = stats.get('item_scraped_count')
        pourcentage = comptage * 100 / pige
        body = pprint.pformat(stats)
        body = spider.name + " is " + reason + "\n\n" + "Le comptage a atteint " + str(
            pourcentage) + "%\n" + intro + body
        mailer.send(to=["*****@*****.**"],
                    subject="The crawl of %s is %s " % (spider.name, reason),
                    body=body)
Example #30
    def test_send(self):
        mailsender = MailSender(debug=True)
        mailsender.send(to=['*****@*****.**'], subject='subject', body='body', _callback=self._catch_mail_sent)

        assert self.catched_msg

        self.assertEqual(self.catched_msg['to'], ['*****@*****.**'])
        self.assertEqual(self.catched_msg['subject'], 'subject')
        self.assertEqual(self.catched_msg['body'], 'body')

        msg = self.catched_msg['msg']
        self.assertEqual(msg['to'], '*****@*****.**')
        self.assertEqual(msg['subject'], 'subject')
        self.assertEqual(msg.get_payload(), 'body')
Example #31
 def send_mail(self, flat_uri):
     # only for gmail smtps servers, read here for more: https://www.quora.com/What-is-SMTP-Host
     # check here for TLS vs SSL: http://www.smtp-gmail.com/
     # probably you will have to enable "Access for less secure apps"(on gmail host) for this to work
     mailer = MailSender(smtphost='smtp.gmail.com',
                         mailfrom='scrapy_bot',
                         smtpuser=self.user_mail,
                         smtppass=self.user_pass,
                         smtpssl=True,
                         smtpport=465)
     mailer.send(
         to=self.user_mail,
         subject='detektiran potencijalan stan',
         body='robot je prepoznao sljedeci stan kao potencijalan:\n' +
         flat_uri)
Example #32
    def test_send(self):
        mailsender = MailSender(debug=True)
        mailsender.send(to=['*****@*****.**'], subject='subject', body='body', _callback=self._catch_mail_sent)

        assert self.catched_msg

        self.assertEqual(self.catched_msg['to'], ['*****@*****.**'])
        self.assertEqual(self.catched_msg['subject'], 'subject')
        self.assertEqual(self.catched_msg['body'], 'body')

        msg = self.catched_msg['msg']
        self.assertEqual(msg['to'], '*****@*****.**')
        self.assertEqual(msg['subject'], 'subject')
        self.assertEqual(msg.get_payload(), 'body')
        self.assertEqual(msg.get('Content-Type'), 'text/plain')
Example #33
 def spider_closed(self, spider, reason):
     mailer = MailSender()
     pige = 1324
     intro = "Summary stats from Scrapy spider: \n\n"
     stats = spider.crawler.stats.get_stats()
     comptage = stats.get('item_scraped_count')
     pourcentage = comptage * 100 / pige
     body = intro + "Finish reason : " + reason + "\n" + "Item scraped count : " + str(
         comptage) + "\n" + "Le comptage a atteint " + str(
             pourcentage) + "%\n"
     mailer.send(to=["*****@*****.**"],
                 subject="The crawl of %s is %s " % (spider.name, reason),
                 body=body)
     self.exporter.finish_exporting()
     file = self.files.pop(spider)
     file.close()
Example #34
    def parse(self, response):
        # approach 1
        # mailer = MailSender(mailfrom="*****@*****.**", smtphost="smtp.exmail.qq.com", smtpport=465,
        #                     smtpuser="******", smtppass="******",smtptls=True, smtpssl=True)
        print("url:", response.url)

        # approach 1
        # mailer = MailSender(mailfrom=settings['MAIL_FROM'],
        #                     smtphost=settings['MAIL_HOST'],
        #                     smtpport=settings['MAIL_PORT'],
        #                     smtpuser=settings['MAIL_USER'],
        #                     smtppass=settings['MAIL_PASS'],
        #                     smtptls=settings['MAIL_TLS'],
        #                     smtpssl=settings['MAIL_SSL'])

        # approach 1
        mailer = MailSender(mailfrom=self.settings['MAIL_FROM'],
                            smtphost=self.settings['MAIL_HOST'],
                            smtpport=self.settings['MAIL_PORT'],
                            smtpuser=self.settings['MAIL_USER'],
                            smtppass=self.settings['MAIL_PASS'],
                            smtptls=self.settings['MAIL_TLS'],
                            smtpssl=self.settings['MAIL_SSL'])

        # approach 2
        # mailer = MailSender.from_settings(self.settings)
        return mailer.send(to=["*****@*****.**"],
                           subject="title test",
                           body="text test")
        print("end")
Example #35
 def spider_closed(self, spider, reason):
     mailer = MailSender(mailfrom="*****@*****.**")
     pige = 52792
     intro = "Summary stats from Scrapy spider: \n\n"
     stats = spider.crawler.stats.get_stats()
     comptage = stats.get('item_scraped_count')
     pourcentage = comptage * 100 / pige
     body = intro + "Finish reason : " + reason + "\n" + "Item scraped count : " + str(
         comptage) + "\n" + "Le comptage a atteint " + str(
             pourcentage) + "%\n"
     mailer.send(to=[
         "*****@*****.**", "*****@*****.**",
         "*****@*****.**"
     ],
                 subject="The crawl of %s is %s " % (spider.name, reason),
                 body=body)
Example #36
 def close_spider(self, spider):
     mailer = MailSender(smtphost=STMPHOST,
                         mailfrom=EMAIL_FROM,
                         smtppass=STMPPASS,
                         smtpuser=EMAIL_FROM,
                         smtpport=STMPPORT,
                         smtptls=True)
     if spider.start_time == spider.end_time:
         subject = '(' + spider.end_time + ')招标文件,及时查收'
     else:
         subject = '(' + spider.start_time + '--' + spider.end_time + ')招标文件,及时查收'
     file = spider.zip_path
     if os.path.isfile(file):
         print(type(os.path.basename(file)))
         attachs = [(os.path.basename(file), EMAIL_ATTACH_MIME,
                     open(file, "rb"))]
         body = '招标邮件,及时查收'.encode('utf-8')
     else:
         body = '今日无数据'.encode('utf-8')
         attachs = ()
     return mailer.send(to=EMAIL_TO,
                        subject=subject,
                        body=body,
                        cc=["*****@*****.**"],
                        attachs=attachs,
                        mimetype="text/plain",
                        charset='utf-8')
Example #37
 def close_spider(self, spider):
     mailer = MailSender(
         smtphost="smtp.163.com",  # SMTP server used to send the mail
         mailfrom="*****@*****.**",  # sender address
         smtpuser="******",  # username
         smtppass="******",  # SMTP authorization code
         smtpport=25  # port number
     )
     send_time = datetime.now().replace(microsecond=0).isoformat(' ')
     mail_body = send_time + str(self.count) + u""" 
     items processed successfully!
     """
     mail_subject = u'scraped.'
     mailer.send(to=["*****@*****.**", "*****@*****.**"],
                 subject=mail_subject,
                 body=mail_body)
Example #38
    def test_send_utf8(self):
        subject = u'sübjèçt'
        body = u'bödÿ-àéïöñß'
        mailsender = MailSender(debug=True)
        mailsender.send(to=['*****@*****.**'], subject=subject, body=body,
                        charset='utf-8', _callback=self._catch_mail_sent)

        assert self.catched_msg
        self.assertEqual(self.catched_msg['subject'], subject)
        self.assertEqual(self.catched_msg['body'], body)

        msg = self.catched_msg['msg']
        self.assertEqual(msg['subject'], subject)
        self.assertEqual(msg.get_payload(), body)
        self.assertEqual(msg.get_charset(), Charset('utf-8'))
        self.assertEqual(msg.get('Content-Type'), 'text/plain; charset="utf-8"')
Example #39
    def close_spider(self, spider):
        mailer = MailSender(smtphost="smtp.163.com",
                            mailfrom="*****@*****.**",
                            smtppass="******",
                            smtpuser="******",
                            smtpport=25,
                            smtptls=True)

        subject = spider.output_excel_filename
        attach_mime = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
        if os.path.isfile(spider.output_excel_filename):
            attachs = [(spider.output_excel_filename, attach_mime,
                        open(spider.output_excel_filename, "rb"))]
            body = '招标邮件,及时查收'
        else:
            body = (spider.zh_name + '今日无数据(' + spider.today +
                    ')').encode('utf-8')
            attachs = ()  #, "*****@*****.**"
        return mailer.send(to=["*****@*****.**"],
                           subject=subject,
                           body=body,
                           cc=["*****@*****.**"],
                           attachs=attachs,
                           mimetype="text/plain",
                           charset='utf-8')
Example #40
    def close_spider(self, spider):
        self.file.close()
        mailer = MailSender(
            mailfrom=mail_username,
            smtphost="smtp.gmail.com",
            smtpport=465,
            smtpuser=mail_username,
            smtppass=mail_password,
            smtpssl=True
        )
        if len(self.new_cars) > 0:
            links = '\n'.join(self.new_cars)
            mailer.send(
                to=[spider.mail_to],
                subject="New cars for you - " + str(len(self.new_cars)) + " - " + spider.name,
                body=links + "----------------------------\n" + "All cars from this category: " + spider.start_urls[0]
            )
Example #41
    def close_spider(self, spider):
        subject = 'Image Scraper Report for ' + datetime.date.today().strftime("%m/%d/%y")
        from_email = "*****@*****.**"
        to_email = "*****@*****.**"
        msg = MIMEMultipart()
        msg['From'] = from_email
        msg['To'] = to_email
        msg['Subject'] = subject
        intro = "Summary stats from Scrapy spider: \n\n"
        body = spider.crawler.stats.get_stats()
        body = pprint.pformat(body)
        body = intro + body
        msg.attach(MIMEText(body, 'plain'))

        mailer = MailSender()
        text = msg.as_string()
        mailer.send(to=[to_email], subject=subject, body=text)
Example #42
 def send_email(self, mail_body):
     mailer = MailSender(mailfrom="*****@*****.**",
                         smtphost="smtp.gmail.com",
                         smtpport=587,
                         smtpuser="******",
                         smtppass="******")
     return mailer.send(to=["*****@*****.**"],
                        subject="StockSpider: Stock Spiders Contract Error",
                        body=mail_body)
Example #43
    def closed(self, reason):  # send an email when the crawl finishes
        from scrapy.mail import MailSender

        # mailer = MailSender.from_settings(settings)  # raised an error here; cause not found
        mailer = MailSender(
            smtphost="smtp.163.com",  # SMTP server used to send the mail
            mailfrom="***********@163.com",  # sender address
            smtpuser="******",  # username
            smtppass="******",  # the SMTP authorization code, not the account's login password
            smtpport=25  # port number
        )
        body = u"""
        发送的邮件内容
        """
        subject = u'发送的邮件标题'
        mailer.send(to=["****@qq.com", "****@qq.com"],
                    subject=subject.encode("utf-8"),
                    body=body.encode("utf-8"))
Example #44
    def send_email(self):
        intro = "407 and 429 errors: \n\n"
        body = "During current parsing session received 10 responses with status 407 or 429"
        body = pprint.pformat(body)
        body = intro + body

        settings = get_project_settings()
        mailer = MailSender(smtphost=settings.get("SMTP_HOST"),
                            mailfrom=settings.get("MAIL_FROM"),
                            smtpuser=settings.get("SMTP_USER"),
                            smtppass=settings.get("SMTP_PASS"),
                            smtpport=settings.get("SMTP_PORT"),
                            smtptls=settings.get("SMTP_TLS"),
                            smtpssl=settings.get("SMTP_SSL")
                            )
        mailer.send(to=settings.get("MAIL_RECEIVERS"),
                    subject="Booking Scrapy parser. Error report for " + datetime.today().strftime("%d.%m.%Y %H:%M"),
                    body=body,
                    )
Example #45
def send_bug_email(err=None, type=0):
    mailer = MailSender(
        smtphost="smtp.163.com",  # SMTP server used to send the mail
        mailfrom="*****@*****.**",  # sender address
        smtpuser="******",  # username
        smtppass="******",  # the SMTP authorization code, not the account's login password
        smtpport=25  # port number
    )
    to = ["*****@*****.**", "*****@*****.**"]
    subject = u"啊欧~~,你的程序GG了..."
    body = """<html>
                <body>
                    <h3><i style='color:#349CFF;'>【Infinity Group: BUG侦测系统】</i></h3>
                    <p>
                        <strong>助手小i提醒您</strong>  位于
                        <font color='green'>
                            <a href='https://www.aliyun.com/'>阿里云服务器</a>
                        </font>上基于scrapy的爬虫程序已经GG了,
                        <font color='red'>请赶快前往抢修BUG!!!</font>
                    </p>
                    <h4><font color='red'>TRACEBACK:</font></h4>
                    <p><font color='red'>%s</font></p>
                    <p><font color='red'>%s</font></p>
                </body>
              </html>
            """ % (err.__str__(), '出错类型:' + str(
        err.__class__).lstrip('<').rstrip('>')) if type == 0 else """<html>
                        <body>
                            <h3><i style='color:#349CFF;'>【Infinity Group: BUG侦测系统】</i></h3>
                            <p>
                                <strong>助手小i提醒您</strong>  位于
                                <font color='green'>
                                    <a href='https://www.aliyun.com/'>阿里云服务器</a>
                                </font>上基于scrapy的爬虫程序已经关闭了,
                                <font color='red'>若非管理员正常关闭,请及时前往重新启动!!!</font>
                            </p>
                        </body>
                      </html>
                    """
    cc = None
    mailer.send(to=to, subject=subject, body=body, cc=cc,
                mimetype='text/HTML')  # cc works like distributing copies to extra recipients
Example #46
	def parse(self, response):
		logging.basicConfig(filename='scraper.log', level=logging.DEBUG)
		mailer = MailSender(smtphost="smtp.sendgrid.net", mailfrom="scrapy@localhost",smtpuser="******", smtppass="******", smtpport=25)
		
		job_item = []
		email_body = "DOST \n"
		for sel in response.xpath('//div[@id="jg_el_listing_single"]/table/tr[@style]'): # define jobs postings table selector
			item = ScrapejobsItem()
			item['title'] = sel.xpath('td[@class="jg_jobtitle"]/strong/text()').extract()
			item['link'] = sel.xpath('@onclick').extract()
			removed_prefix = ("".join(item['link'])).strip("window.location=")
			stripped_link = removed_prefix.strip("'")
			job_item.append("Job Item: " + "".join(item['title']) + "\n Link: http://dost.gov.ph" + stripped_link)

			#yield item

		for job in job_item:
			email_body = email_body + job + "\n\n"
		
		print email_body

		mailer.send(to=["*****@*****.**"], subject="Scrapy Job", body=email_body)
Example #47
class RentPipeline(object):
    collection_name = 'house'

    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            mongo_uri = crawler.settings.get("MONGO_URI"),
            mongo_db = crawler.settings.get("MONGO_DATABASE", "items")
        )

    def open_spider(self, spider):
        self.mailer = MailSender(smtphost="smtp.exmail.qq.com",
                                 mailfrom="*****@*****.**",
                                 smtpuser="******",
                                 smtppass="******")
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        row = {}
        row['id'] = item['id'][0]
        row['title'] = item['title'][0].strip()
        row['link'] = item['link'][0]
        doc = self.db[self.collection_name].find({"id": row['id']})
        if doc.count() == 0:
            self.mailer.send(to=["*****@*****.**", "*****@*****.**"],
                             subject=u"[新房源]".encode('utf-8') + row['title'].encode('utf-8'),
                             body=u"[链接地址]".encode('utf-8') + row['link'])
            self.db[self.collection_name].insert(row)
        else:
            print "has exist", row['id']
        return item
Example #48
    def log(self, spider):

		
        items = self.stats.get_value('item_scraped_count', 0)
        pages = self.stats.get_value('response_received_count', 0)
        exception_count = self.stats.get_value('downloader/exception_count',0)
        
        irate = (items - self.itemsprev) * self.multiplier
        prate = (pages - self.pagesprev) * self.multiplier
		
        errrate = (exception_count - self.exception_countprev) * self.multiplier

        self.pagesprev, self.itemsprev = pages, items
        

        msg = ("Crawled "+str(pages)+" pages (at "+str(prate)+" pages/min), scraped "+str(items)+" items (at "+str(irate)+" items/min)")
        #log_args = {'pages': pages, 'pagerate': prate,'items': items, 'itemrate': irate}
        
        logging.info(msg)
        if errrate > self.error_threshold:
            mailer = MailSender(smtphost=self.smtphost, mailfrom=self.mailfrom, smtpuser=self.smtpuser, smtppass=self.smtppass)
            mailer.send(to=[self.mailto], subject="Scrapy twitter Error", body="Exception rate has reached to %d" % (errrate))
Example #49
	def parse(self, response):
		job_item = []

		mailer = MailSender(smtphost="smtp.sendgrid.net", mailfrom="scrapy@localhost",smtpuser="******", smtppass="******", smtpport=25)
		email_body = "DOST PCIEERD\n"

		for selector in response.xpath("//table/tbody/tr/td/a"):
			item = ScrapejobsItem()

			if (selector.xpath("strong/text()")): #run validation if the job item title has "strong" child
				#print selector.xpath("text()").extract() + selector.xpath("strong/text()").extract()
				item['title'] = selector.xpath("text()").extract() + selector.xpath("strong/text()").extract()
				item['link'] = selector.xpath("@href").extract()
				#print selector.xpath("@href").extract()	
			elif (selector.xpath("text()")):
				item['title'] = selector.xpath("text()").extract()
				item['link'] = selector.xpath("@href").extract()
				#print selector.xpath("text()").extract()
				#print selector.xpath("@href").extract()	
			else:
				print "\n"

			try:
				#print "".join(item['title']) + "\n"
				#print "".join(item['link']) + "\n\n"
				job_item.append("Job Title: " + "".join(item['title']).encode('utf-8') + "\n Link: http://pcieerd.dost.gov.ph" + "".join(item['link']).encode('utf-8'))
			except KeyError as e:
				print e;

			except UnicodeEncodeError as e: #catch unicode printing to console exception 
				print e;
		
		for item in job_item:
			email_body = email_body + "\n" + item

		mailer.send(to=["*****@*****.**"], subject="DOST PCIEERD Scraped Jobs", body=email_body)
Example #50
    def spider_closed(self, spider):
        mailer = MailSender(mailfrom="*****@*****.**",smtphost="smtp.gmail.com",smtpport=587,smtpuser="******",smtppass="******")

        # get statistics
        self.cur.execute("SELECT COUNT(*) FROM Results")
        crawled = self.cur.fetchone()

        self.cur.execute("SELECT COUNT(*) FROM RequestUrls")
        totalUrl = self.cur.fetchone()
        toBeCrawled = totalUrl[0] - crawled[0]

        emailBody = "Crawled: " + str(crawled[0]) + "\nTo be crawled: " + \
                    str(toBeCrawled) + "\nProgress: " + str(float(crawled[0])/totalUrl[0])

        return mailer.send(to=["*****@*****.**"],subject="Test",body=emailBody)
Example #51
class KuwoScrapyMailWriter():
    spider = None
    num_new = 0
    new_list = []
    item_list = []
    addr_list = []
    mail_sender = None
    mail_content = ''
    spider_inware = None
    
    def __init__(self,spider , spider_inware, addr_list):
        self.mail_sender = MailSender(smtphost = MAIL_HOST, mailfrom = MAIL_FROM, smtpuser = MAIL_USER,
                                      smtppass = MAIL_PASS)
        self.spider = spider
        self.addr_list = addr_list
        self.spider_inware = spider_inware


    def push(self, item, crawled):
        if crawled:
            self.num_new += 1
        self.new_list.append(crawled)
        self.item_list.append(item)


    def write_mail(self):
        lines = []
        head = "<h1>%s</h1><hr/><ol>" % self.spider_inware['basic_desc']
        tail = "</ol><hr/>%s" % self.spider_inware['m_info']
        for item in self.item_list:
            mark = ''
            info = json.loads(item['basic_source_info'])
            info_str = '<span>' + json.dumps(info, indent = 2) + '</span>'

            if self.new_list[len(lines)]:
                mark = '########'

            line = '<li>' + mark
            if item['basic_source_artist']:
                line += '%(basic_source_artist)s - %(basic_source_name)s ' % item
            else:
                line += '%(basic_source_name)s ' % item

            if info:
                line += info_str

            line += '</li>'
            line = line.decode(KUWO_SCRAPY_MYSQL_CS)

            lines.append(line)
            
            
        html_head = '<div>'
        html_tail = '</div>'

        if self.spider.start_urls:
            index = 1
            for url in self.spider.start_urls:
                if url:
                    html_tail += u'<a href="%s" target="_blank">连接%d</a>&nbsp;' % (url, index)
                    index += 1

        self.mail_content = head + '\n'.join(lines) + tail
        log.msg("Write mail : \n %s" % self.mail_content, log.DEBUG)
        self.mail_content = html_head + self.mail_content + html_tail



    def send_mails(self):
        log.msg("Sending mail :: \n num_new : %d \n spider_inware : %s \n addr_list : %s" % (self.num_new, json.dumps(self.spider_inware, indent = 2), 
                                                                                             json.dumps(self.addr_list, indent = 2)), log.DEBUG )
        if (self.num_new < MIN_ALARM_LIMIT) or (not self.spider_inware) or (len(self.addr_list) < 1):
            return

        self.write_mail()
        try:
            addr = "<%s>" % ('>,<'.join(self.addr_list))
            log.msg("Sending mail to %s" % (addr), log.INFO)
            self.mail_sender.send(to = self.addr_list, subject = self.spider_inware['basic_desc'], 
                                  body = self.mail_content.encode('utf-8','ignore'), 
                                  mimetype = 'text/HTML;charset="utf-8"')
        except Exception,e:
            log.msg("Sending mail error : %s" % (e), log.ERROR)
Example #52
class MemoryUsage(BaseMiddleware):
    
    memusage_enable = BooleanField(default=False)
    memusage_notify_mail = ListField(default=[])
    memusage_limit_mb = IntegerField(default=0)
    memusage_warning_mb = IntegerField(default=0)
    memusage_report = BooleanField(default=False)
    
    
    def __init__(self, crawler):
        super(MemoryUsage, self).__init__(crawler.metas)
        if not self.memusage_enable.to_value():
            raise NotConfigured
        if not procfs_supported():
            raise NotConfigured

        self.crawler = crawler
        self.warned = False
        self.notify_mails = self.memusage_notify_mail.to_value()
        self.limit = self.memusage_limit_mb.to_value() * 1024 * 1024
        self.warning = self.memusage_warning_mb.to_value() * 1024 * 1024
        self.report = self.memusage_report.to_value()
        self.mail = MailSender(self.metas)
        dispatcher.connect(self.engine_started, signal=signals.engine_started)
        dispatcher.connect(self.engine_stopped, signal=signals.engine_stopped)

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def get_virtual_size(self):
        return get_vmvalue_from_procfs('VmSize')

    def engine_started(self):
        stats.set_value('memusage/startup', self.get_virtual_size())
        self.tasks = []
        tsk = task.LoopingCall(self.update)
        self.tasks.append(tsk)
        tsk.start(60.0, now=True)
        if self.limit:
            tsk = task.LoopingCall(self._check_limit)
            self.tasks.append(tsk)
            tsk.start(60.0, now=True)
        if self.warning:
            tsk = task.LoopingCall(self._check_warning)
            self.tasks.append(tsk)
            tsk.start(60.0, now=True)

    def engine_stopped(self):
        for tsk in self.tasks:
            if tsk.running:
                tsk.stop()

    def update(self):
        stats.max_value('memusage/max', self.get_virtual_size())

    def _check_limit(self):
        if self.get_virtual_size() > self.limit:
            stats.set_value('memusage/limit_reached', 1)
            mem = self.limit / 1024 / 1024
            log.msg("Memory usage exceeded %dM. Shutting down Scrapy..." % mem, level=log.ERROR)
            if self.notify_mails:
                subj = "%s terminated: memory usage exceeded %dM at %s" % \
                        (self.crawler.settings['BOT_NAME'], mem, socket.gethostname())
                self._send_report(self.notify_mails, subj)
                stats.set_value('memusage/limit_notified', 1)
            self.crawler.stop()

    def _check_warning(self):
        if self.warned: # warn only once
            return
        if self.get_virtual_size() > self.warning:
            stats.set_value('memusage/warning_reached', 1)
            mem = self.warning / 1024 / 1024
            log.msg("Memory usage reached %dM" % mem, level=log.WARNING)
            if self.notify_mails:
                subj = "%s warning: memory usage reached %dM at %s" % \
                        (self.crawler.settings['BOT_NAME'], mem, socket.gethostname())
                self._send_report(self.notify_mails, subj)
                stats.set_value('memusage/warning_notified', 1)
            self.warned = True

    def _send_report(self, rcpts, subject):
        """send notification mail with some additional useful info"""
        s = "Memory usage at engine startup : %dM\r\n" % (stats.get_value('memusage/startup') / 1024 / 1024)
        s += "Maximum memory usage           : %dM\r\n" % (stats.get_value('memusage/max') / 1024 / 1024)
        s += "Current memory usage           : %dM\r\n" % (self.get_virtual_size() / 1024 / 1024)

        s += "ENGINE STATUS ------------------------------------------------------- \r\n"
        s += "\r\n"
        s += pformat(get_engine_status())
        s += "\r\n"
        self.mail.send(rcpts, subject, s)
Example #53
 def test_send_single_values_to_and_cc(self):
     mailsender = MailSender(debug=True)
     mailsender.send(to='*****@*****.**', subject='subject', body='body',
                     cc='*****@*****.**', _callback=self._catch_mail_sent)
Example #54
class MemoryUsage(object):
    
    def __init__(self, crawler):
        if not crawler.settings.getbool('MEMUSAGE_ENABLED'):
            raise NotConfigured
        if not procfs_supported():
            raise NotConfigured

        self.crawler = crawler
        self.warned = False
        self.notify_mails = crawler.settings.getlist('MEMUSAGE_NOTIFY_MAIL')
        self.limit = crawler.settings.getint('MEMUSAGE_LIMIT_MB')*1024*1024
        self.warning = crawler.settings.getint('MEMUSAGE_WARNING_MB')*1024*1024
        self.report = crawler.settings.getbool('MEMUSAGE_REPORT')
        self.mail = MailSender()
        crawler.signals.connect(self.engine_started, signal=signals.engine_started)
        crawler.signals.connect(self.engine_stopped, signal=signals.engine_stopped)

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def get_virtual_size(self):
        return get_vmvalue_from_procfs('VmRSS')

    def engine_started(self):
        self.crawler.stats.set_value('memusage/startup', self.get_virtual_size())
        self.tasks = []
        tsk = task.LoopingCall(self.update)
        self.tasks.append(tsk)
        tsk.start(60.0, now=True)
        if self.limit:
            tsk = task.LoopingCall(self._check_limit)
            self.tasks.append(tsk)
            tsk.start(60.0, now=True)
        if self.warning:
            tsk = task.LoopingCall(self._check_warning)
            self.tasks.append(tsk)
            tsk.start(60.0, now=True)

    def engine_stopped(self):
        for tsk in self.tasks:
            if tsk.running:
                tsk.stop()

    def update(self):
        self.crawler.stats.max_value('memusage/max', self.get_virtual_size())

    def _check_limit(self):
        if self.get_virtual_size() > self.limit:
            self.crawler.stats.set_value('memusage/limit_reached', 1)
            mem = self.limit/1024/1024
            log.msg("Memory usage exceeded %dM. Shutting down Scrapy..." % mem, level=log.ERROR)
            if self.notify_mails:
                subj = "%s terminated: memory usage exceeded %dM at %s" % \
                        (self.crawler.settings['BOT_NAME'], mem, socket.gethostname())
                self._send_report(self.notify_mails, subj)
                self.crawler.stats.set_value('memusage/limit_notified', 1)
            open_spiders = self.crawler.engine.open_spiders
            if open_spiders:
                for spider in open_spiders:
                    self.crawler.engine.close_spider(spider, 'memusage_exceeded')
            else:
                self.crawler.stop()

    def _check_warning(self):
        if self.warned: # warn only once
            return
        if self.get_virtual_size() > self.warning:
            self.crawler.stats.set_value('memusage/warning_reached', 1)
            mem = self.warning/1024/1024
            log.msg("Memory usage reached %dM" % mem, level=log.WARNING)
            if self.notify_mails:
                subj = "%s warning: memory usage reached %dM at %s" % \
                        (self.crawler.settings['BOT_NAME'], mem, socket.gethostname())
                self._send_report(self.notify_mails, subj)
                self.crawler.stats.set_value('memusage/warning_notified', 1)
            self.warned = True

    def _send_report(self, rcpts, subject):
        """send notification mail with some additional useful info"""
        stats = self.crawler.stats
        s = "Memory usage at engine startup : %dM\r\n" % (stats.get_value('memusage/startup')/1024/1024)
        s += "Maximum memory usage           : %dM\r\n" % (stats.get_value('memusage/max')/1024/1024)
        s += "Current memory usage           : %dM\r\n" % (self.get_virtual_size()/1024/1024)

        s += "ENGINE STATUS ------------------------------------------------------- \r\n"
        s += "\r\n"
        s += pformat(get_engine_status(self.crawler.engine))
        s += "\r\n"
        self.mail.send(rcpts, subject, s)
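The extension above is configured entirely through settings; a short sketch of the keys its __init__ reads, with placeholder values:

# settings.py (placeholder values; the keys match the getbool/getint/getlist calls above)
MEMUSAGE_ENABLED = True
MEMUSAGE_LIMIT_MB = 2048                    # hard limit: open spiders are closed with 'memusage_exceeded'
MEMUSAGE_WARNING_MB = 1536                  # soft limit: a single warning mail is sent
MEMUSAGE_NOTIFY_MAIL = ["ops@example.com"]  # recipients passed to _send_report()
MEMUSAGE_REPORT = True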
Example #55
from scrapy.mail import MailSender


mailer = MailSender()
mailer.send(to=["*****@*****.**"], subject="Some subject", body="Some body", cc=['*****@*****.**'])
Example #56
class MemoryUsage(object):
    def __init__(self):
        if not settings.getbool("MEMUSAGE_ENABLED"):
            raise NotConfigured
        if not procfs_supported():
            raise NotConfigured

        self.warned = False
        self.notify_mails = settings.getlist("MEMUSAGE_NOTIFY")
        self.limit = settings.getint("MEMUSAGE_LIMIT_MB") * 1024 * 1024
        self.warning = settings.getint("MEMUSAGE_WARNING_MB") * 1024 * 1024
        self.report = settings.getbool("MEMUSAGE_REPORT")
        self.mail = MailSender()
        dispatcher.connect(self.engine_started, signal=signals.engine_started)
        dispatcher.connect(self.engine_stopped, signal=signals.engine_stopped)

    def get_virtual_size(self):
        return get_vmvalue_from_procfs("VmSize")

    def engine_started(self):
        stats.set_value("memusage/startup", self.get_virtual_size())
        self.tasks = []
        tsk = task.LoopingCall(self.update)
        self.tasks.append(tsk)
        tsk.start(60.0, now=True)
        if self.limit:
            tsk = task.LoopingCall(self._check_limit)
            self.tasks.append(tsk)
            tsk.start(60.0, now=True)
        if self.warning:
            tsk = task.LoopingCall(self._check_warning)
            self.tasks.append(tsk)
            tsk.start(60.0, now=True)

    def engine_stopped(self):
        for tsk in self.tasks:
            if tsk.running:
                tsk.stop()

    def update(self):
        stats.max_value("memusage/max", self.get_virtual_size())

    def _check_limit(self):
        if self.get_virtual_size() > self.limit:
            stats.set_value("memusage/limit_reached", 1)
            mem = self.limit / 1024 / 1024
            log.msg("Memory usage exceeded %dM. Shutting down Scrapy..." % mem, level=log.ERROR)
            if self.notify_mails:
                subj = "%s terminated: memory usage exceeded %dM at %s" % (
                    settings["BOT_NAME"],
                    mem,
                    socket.gethostname(),
                )
                self._send_report(self.notify_mails, subj)
                stats.set_value("memusage/limit_notified", 1)
            crawler.stop()

    def _check_warning(self):
        if self.warned:  # warn only once
            return
        if self.get_virtual_size() > self.warning:
            stats.set_value("memusage/warning_reached", 1)
            mem = self.warning / 1024 / 1024
            log.msg("Memory usage reached %dM" % mem, level=log.WARNING)
            if self.notify_mails:
                subj = "%s warning: memory usage reached %dM at %s" % (settings["BOT_NAME"], mem, socket.gethostname())
                self._send_report(self.notify_mails, subj)
                stats.set_value("memusage/warning_notified", 1)
            self.warned = True

    def _send_report(self, rcpts, subject):
        """send notification mail with some additional useful info"""
        s = "Memory usage at engine startup : %dM\r\n" % (stats.get_value("memusage/startup") / 1024 / 1024)
        s += "Maximum memory usage           : %dM\r\n" % (stats.get_value("memusage/max") / 1024 / 1024)
        s += "Current memory usage           : %dM\r\n" % (self.get_virtual_size() / 1024 / 1024)

        s += "ENGINE STATUS ------------------------------------------------------- \r\n"
        s += "\r\n"
        s += pformat(get_engine_status())
        s += "\r\n"
        self.mail.send(rcpts, subject, s)
Example #57
 def close_spider(self, spider):
     mailer = MailSender()
     mailer.send(to=["*****@*****.**"], subject="PowderValley New In-Stock", body=pprint.pformat(self.items))