Example #1
 def export(self):
     """
     """
     logger.debug("Begin RSS Export:")
     db = CrawlDB()   # crawled-page database wrapper
     rep = Pattern()  # content clean-up helper; rep.sub() rewrites page HTML
     for pat in db.getPatterns():
         pid = pat["pid"]
         pattern = pat["pattern"]
         description = pat["name"]
         items = []
         # render at most 10 stored pages of this pattern as feed items
         for page in db.getPages("where pid=%d limit 10" % pid):
             items.append(self.rssitem % (page["url"],
                                          page["title"],
                                          "",
                                          pattern,
                                          "",
                                          page["url"],
                                          rep.sub(page["content"])))
         itemout = "\n".join(items)
         output = self.rssframe % (pattern,
                                   "http://hjbbs.com/bbs",
                                   description,
                                   "Learning English Tool",
                                   itemout)
         logger.debug("LET %d:\n%s\n" % (pid, output))
         # write out
         with open("%slet%d.xml" % (config.RSSDIR, pid), "w") as fp:
             fp.write(output.encode('utf8'))
     logger.debug("End RSS Export.")
Example #2
class CrawlPages:
    """
    Crawl pages from the hjbbs.com forum whose titles match the
    configured patterns.
    """
    def __init__(self):
        """
        Set up the database handle, the content clean-up helper and
        the compiled title regex.
        """
        self.db = CrawlDB()
        self.pat = Pattern()
        # self.patterns = config.PATTERNS
        self.titlere = re.compile(config.TITLEPATTERN)
        self.patterns = self.db.getPatterns() # unicode patterns
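        # getPatterns() returns a list of dicts with the keys "pid",
        # "pattern" and "name", as consumed by Example #1 above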

    def parseTitles(self):
        """
        Fetch the index page first, then scan its content for every
        link that still needs to be retrieved.
        @return list of dicts with keys "url" and "pid"
        """
        logger.info("root url: " + config.ROOTURL)
        sock = urllib2.urlopen(config.ROOTURL)
        lines = sock.readlines()
        sock.close()

        if config.DEBUG:
            pdb.set_trace()

        logger.info("Index Content: %s" %
                    ("\n".join(lines)).decode("gbk"))

        prelines = []
        for line in lines:
            if len(line) > 10:      # skip very short lines; they cannot hold a title link
                for pat in self.patterns:
                    if line.find(pat["pattern"].encode("gbk")) != -1:
                        prelines.append({"line": line,
                                         "pid": pat["pid"]})

        logger.info("catched lines num: %d " % len(prelines))

        prelinks = []
        for line in prelines:
            mline = self.titlere.search(line["line"])
            if mline:
                # check database
                newurl = "http://www.hjbbs.com/" + mline.group(1)
                if config.DEBUG:
                    pdb.set_trace()

                if not self.db.chkUrl(newurl):
                    prelinks.append({"url": newurl,
                                     "pid": line["pid"]})
        logger.info("links to be crawled num: %d " % len(prelinks))
        return prelinks

    def loginHjbbs(self):
        """
        Log in to hjbbs and keep the session cookie.
        Call this function before crawling any other pages.
        @return True if the login request succeeded, False otherwise
        """
        # install a global opener so every later urlopen() carries the cookie
        cookie_support = urllib2.HTTPCookieProcessor(cookielib.CookieJar())
        opener = urllib2.build_opener(cookie_support, urllib2.HTTPHandler)
        urllib2.install_opener(opener)

        tmpfile = "code.bmp"

        # download the CAPTCHA image, then decode it with the project's
        # getCode() helper
        vcodebmp = urllib2.urlopen('http://hjbbs.com/GetCode.asp').read()
        with open(tmpfile, 'wb') as vcodefile:
            vcodefile.write(vcodebmp)

        vcodenum = getCode(tmpfile)

        postdata = urllib.urlencode({
                'username': config.USERNAME,
                'password': config.PASSWORD,
                'comeurl': 'http://hjbbs.com/index.asp',
                'userhidden': '3',
                'submit': '登录',   # button label, means "log in"; the server expects this exact value
                'CookieDate': 3,
                'SecurityKey': vcodenum
                })
        postheaders = {"User-Agent": "Mozilla/5.0 (X11; U; Linux i686) Gecko/20071127 Firefox/2.0.0.11",
                       "Content-Type": "application/x-www-form-urlencoded",
                       "Referer": "http://hjbbs.com/login.asp",
                       "Connection": "keep-alive",
                       "Keep-Alive": "115"}

        req = urllib2.Request(
            url="http://hjbbs.com/login.asp?action=chk",
            data=postdata,
            headers=postheaders
            )
        try:
            res = urllib2.urlopen(req)
        except urllib2.HTTPError, e:
            logger.error("loginHjbbs http failed: %s %s" % (e.code, e.msg))
            return False
        except urllib2.URLError, e:
            logger.error("loginHjbbs url failed: %s" % e.reason)
            return False
        # a 200 response is taken as success; a stricter check could
        # inspect res.read() for the logged-in marker
        return True