Example #1
0
 def export(self):
     """Export every crawl pattern as an RSS 2.0 feed file.

     For each pattern row in the crawl database, render up to 10 of its
     stored pages through the ``self.rssitem`` template, wrap them in the
     ``self.rssframe`` template, and write the UTF-8 encoded XML to
     ``<config.RSSDIR>let<pid>.xml`` (one file per pattern id).
     """
     logger.debug("Begin RSS Export:")
     db = CrawlDB()
     rep = Pattern()
     for pat in db.getPatterns():
         pid = pat["pid"]
         pattern = pat["pattern"]
         description = pat["name"]
         items = []
         # NOTE(review): SQL clause is built by string interpolation; pid
         # originates from our own database row, so it is trusted here.
         for page in db.getPages("where pid=%d limit 10" % pid):
             items.append(self.rssitem % (page["url"],
                                          page["title"],
                                          "",
                                          pattern,
                                          "",
                                          page["url"],
                                          rep.sub(page["content"])))
         itemout = "\n".join(items)
         output = self.rssframe % (pattern,
                                   "http://hjbbs.com/bbs",
                                   description,
                                   "Learning English Tool",
                                   itemout)
         logger.debug("LET %d:\n%s\n" % (pid, output))
         # 'with' guarantees the handle is closed even if write() raises;
         # the original open()/write()/close() leaked the file on error.
         with open("%slet%d.xml" % (config.RSSDIR, pid), "w") as fp:
             fp.write(output.encode('utf8'))
     logger.debug("End RSS Export.")
Example #2
0
                # Crawl-and-dump branch: log in to the HJBBS forum, then
                # write each parsed article (URL, title, content) to the
                # file object `fp` opened earlier (outside this view).
                if crawl.loginHjbbs():
                    for link in crawl.parseTitles():
                        page = crawl.crawlPage(link)
                        fp.write(link + "\r\n")
                        fp.write(page["title"])
                        fp.write(page["content"])
                        print link
                else:
                    print "login failed"
            elif o in ("-m", "--mail"):
                from mail import SendMail

                db    = CrawlDB()
                mail = SendMail()
                # Mail branch: send every stored page; mark a page's URL as
                # handled in the DB only when sendMail() reports success.
                pages = db.getPages()
                if pages:
                    for page in pages:
                        if mail.sendMail(page["title"], page["content"]):
                            db.setUrl(page["url"])
                else:
                    print "no mail is sent"
                mail.close()
            else:
                # Defensive guard for options accepted by the parser but not
                # handled above. NOTE(review): `assert` disappears under -O;
                # raising would be safer, but that is a behavior change.
                assert False, "unhandled option"
    else:
        # Default (no options): run a full crawl.
        # from time import strftime, gmtime
        # timefmt = strftime("%y-%m-%d", gmtime())
        # print "%s run crawl.crawlPages()" % (timefmt)

        crawl.crawlPages()