def export(self):
    """Export crawled pages, grouped by pattern, into one RSS file per pattern."""
    logger.debug("Begin RSS Export:")
    db = CrawlDB()
    rep = Pattern()
    for pat in db.getPatterns():
        pid = pat["pid"]
        pattern = pat["pattern"]
        description = pat["name"]
        items = []
        for page in db.getPages("where pid=%d limit 10" % pid):
            items.append(self.rssitem % (page["url"], page["title"], "",
                                         pattern, "", page["url"],
                                         rep.sub(page["content"])))
        itemout = "\n".join(items)
        output = self.rssframe % (pattern, "http://hjbbs.com/bbs",
                                  description, "Learning English Tool",
                                  itemout)
        logger.debug("LET %d:\n%s\n" % (pid, output))
        # write out one RSS file per pattern id
        fp = open("%slet%d.xml" % (config.RSSDIR, pid), "w")
        fp.write(output.encode('utf8'))
        fp.close()
    logger.debug("End RSS Export.")
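# The two %-templates used above, self.rssitem and self.rssframe, are defined
# elsewhere and are not shown in this snippet.  What follows is only a hedged
# sketch with the same number and order of slots that export() fills
# (7 for an item: link, title, author, category, comments, guid, description;
# 5 for the frame: title, link, description, generator, items).  The markup
# and slot semantics below are assumptions, not the project's actual strings.
rssframe = u"""<?xml version="1.0" encoding="utf-8"?>
<rss version="2.0">
  <channel>
    <title>%s</title>
    <link>%s</link>
    <description>%s</description>
    <generator>%s</generator>
%s
  </channel>
</rss>"""

rssitem = u"""    <item>
      <link>%s</link>
      <title>%s</title>
      <author>%s</author>
      <category>%s</category>
      <comments>%s</comments>
      <guid>%s</guid>
      <description><![CDATA[%s]]></description>
    </item>"""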
def crawl(start_url):
    cdb = CrawlDB(settings.DB_FILE)
    cdb.connect()
    cdb.enqueue([start_url])
    while True:
        url = cdb.dequeue()
        if url is False:
            break
        if cdb.hasCrawled(url):
            continue
        print url
        status = 0
        req = urllib2.Request(str(url))
        req.add_header("User-Agent", "couchmap 0.1")
        request = None
        try:
            request = urllib2.urlopen(req)
        except urllib2.HTTPError, e:
            # HTTPError is a subclass of URLError, so it must be caught first
            status = e.code
        except urllib2.URLError, e:
            continue
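# crawl() only relies on four CrawlDB methods: connect(), enqueue(),
# dequeue() and hasCrawled().  The class below is a minimal in-memory sketch
# of that interface, written as an assumption to document the expected
# behaviour -- it is not the project's real, file-backed CrawlDB.
class QueueSketch(object):
    def __init__(self, dbfile=None):
        self.pending = []          # URLs waiting to be fetched
        self.seen = set()          # URLs already fetched (filled elsewhere)

    def connect(self):
        pass                       # the real class would open dbfile here

    def enqueue(self, urls):
        self.pending.extend(urls)

    def dequeue(self):
        # crawl() treats a False return value as "queue exhausted"
        return self.pending.pop(0) if self.pending else False

    def hasCrawled(self, url):
        return url in self.seen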
class CrawlPages:
    """Crawl hjbbs.com forum pages that match the configured patterns."""

    def __init__(self):
        """Initialize database handle, content filter and title regexp."""
        self.db = CrawlDB()
        self.pat = Pattern()
        # self.patterns = config.PATTERNS
        self.titlere = re.compile(config.TITLEPATTERN)
        self.patterns = self.db.getPatterns()   # unicode patterns

    def parseTitles(self):
        """
        Fetch the index page first, then search the page content and figure
        out all links to be retrieved.

        @return list of dict(link, pattern-id) to be retrieved
        """
        logger.info("root url: " + config.ROOTURL)
        sock = urllib2.urlopen(config.ROOTURL)
        lines = sock.readlines()
        sock.close()
        if config.DEBUG:
            pdb.set_trace()
        logger.info("Index Content: %s" % ("\n".join(lines)).decode("gbk"))
        prelines = []
        for line in lines:
            if len(line) > 10:  # trick: skip lines too short to be a post link
                for pat in self.patterns:
                    if line.find(pat["pattern"].encode("gbk")) != -1:
                        prelines.append({"line": line, "pid": pat["pid"]})
        logger.info("matched lines num: %d" % len(prelines))
        prelinks = []
        for line in prelines:
            mline = self.titlere.search(line["line"])
            if mline:
                # skip links already recorded in the database
                newurl = "http://www.hjbbs.com/" + mline.group(1)
                if config.DEBUG:
                    pdb.set_trace()
                if not self.db.chkUrl(newurl):
                    prelinks.append({"url": newurl, "pid": line["pid"]})
        logger.info("links to be crawled num: %d" % len(prelinks))
        return prelinks

    def loginHjbbs(self):
        """
        Log in to hjbbs and keep the cookie.  Call this function before
        crawling any other pages.

        @return A boolean value indicating whether login succeeded
        """
        cookie_support = urllib2.HTTPCookieProcessor(cookielib.CookieJar())
        opener = urllib2.build_opener(cookie_support, urllib2.HTTPHandler)
        urllib2.install_opener(opener)
        # fetch the verification-code image and recognize it
        tmpfile = "code.bmp"
        vcodebmp = urllib2.urlopen('http://hjbbs.com/GetCode.asp').read()
        vcodefile = open(tmpfile, 'wb')
        vcodefile.write(vcodebmp)
        vcodefile.close()
        vcodenum = getCode(tmpfile)
        postdata = urllib.urlencode({
            'username': config.USERNAME,
            'password': config.PASSWORD,
            'comeurl': 'http://hjbbs.com/index.asp',
            'userhidden': '3',
            'submit': '登录',  # the form's "log in" button label
            'CookieDate': 3,
            'SecurityKey': vcodenum})
        postheaders = {
            "User-Agent": "Mozilla/5.0 (X11; U; Linux i686) Gecko/20071127 Firefox/2.0.0.11",
            "Content-Type": "application/x-www-form-urlencoded",
            "Referer": "http://hjbbs.com/login.asp",
            "Connection": "keep-alive",
            "Keep-Alive": 115}
        req = urllib2.Request(
            url="http://hjbbs.com/login.asp?action=chk",
            data=postdata,
            headers=postheaders)
        try:
            res = urllib2.urlopen(req)
        except urllib2.HTTPError, e:
            logger.error("loginHjbbs http failed: %s" % e.code)
        except urllib2.URLError, e:
            logger.error("loginHjbbs url failed: %s" % e.reason)
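# parseTitles() expects config.TITLEPATTERN to capture the relative href of a
# post in group(1), because it builds the full URL as
# "http://www.hjbbs.com/" + m.group(1).  The pattern below is a guess written
# only to illustrate that contract; the project's real TITLEPATTERN may differ.
TITLEPATTERN_SKETCH = r'<a\s+href="([^"]+\.asp[^"]*)"'
# Example with hypothetical forum markup:
#   m = re.compile(TITLEPATTERN_SKETCH).search('<a href="bbs/topic.asp?id=1">...')
#   m.group(1)  ->  'bbs/topic.asp?id=1'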
for o, a in opts:
    if o in ("-c", "--crawl"):
        fp = open("/tmp/tmp-page.txt", "wb")
        if crawl.loginHjbbs():
            for link in crawl.parseTitles():
                page = crawl.crawlPage(link)
                fp.write(link + "\r\n")
                fp.write(page["title"])
                fp.write(page["content"])
                print link
        else:
            print "login failed"
    elif o in ("-m", "--mail"):
        from mail import SendMail
        db = CrawlDB()
        mail = SendMail()
        # search db
        pages = db.getPages()
        if pages:
            for page in pages:
                if mail.sendMail(page["title"], page["content"]):
                    db.setUrl(page["url"])
        else:
            print "no mail is sent"
        mail.close()
    else:
        assert False, "unhandled option"
else:
    # from time import strftime, gmtime
    # timefmt = strftime("%y-%m-%d", gmtime())
"*****@*****.**", "*****@*****.**"] title = u"test title here" content = """ Begin of test<br/> <embed type="application/x-shockwave-flash" src="http://www.odeo.com/flash/audio_player_standard_gray.swf" width="400" height="52" allowScriptAccess="always" wmode="transparent" flashvars="external_url=http://172.29.7.127:8000/static/media/35130a760bfc7e5054526ce94c17004f.mp3" /> Another <embed src="http://172.29.7.127:8000/static/media/35130a760bfc7e5054526ce94c17004f.mp3" loop=false autostart=false name="IMG_English" width="300" height="20" /><br/>End of test """ mail.sendMail(to, title, content.decode("utf-8")) else: # send web pages to mail Subscriber logger.info("Begin to send email ...") pat = Pattern() db = CrawlDB() pages = db.getPages() if pages: logger.debug("Fetched %s pages." % (len(pages))) for page in pages: addrlist = db.getEmailByPid(page["pid"]) if addrlist: logger.debug("send mail to %s persons..." % (len(addrlist))) content = pat.sub(page["content"]) if mail.sendMail(addrlist, page["title"], content): db.setUrl(page["url"]) logger.info("Page [%s] is sent to %s\n\n%s\n\n" % (page["title"], ",".join(addrlist),