Example #1
 def export(self):
     """
     Export the crawled pages, grouped by pattern, as per-pattern RSS files
     under config.RSSDIR.
     """
     logger.debug("Begin RSS Export:")
     db = CrawlDB()
     rep = Pattern()
     for pat in db.getPatterns():
         pid = pat["pid"]
         pattern = pat["pattern"]
         description = pat["name"]
         items = []
         for page in db.getPages("where pid=%d limit 10" % pid):
             items.append(self.rssitem % (page["url"],
                                          page["title"],
                                          "",
                                          pattern,
                                          "",
                                          page["url"],
                                          rep.sub(page["content"])))
         itemout = "\n".join(items)
         output = self.rssframe % (pattern,
                                   "http://hjbbs.com/bbs",
                                   description,
                                   "Learning English Tool",
                                   itemout)
         logger.debug("LET %d:\n%s\n" % (pid, output))
         # write the per-pattern feed to its own file
         with open("%slet%d.xml" % (config.RSSDIR, pid), "w") as fp:
             fp.write(output.encode('utf8'))
     logger.debug("End RSS Export.")
Example #2
 def __init__(self):
     """
     Set up the database handle, the content filter and the compiled title
     regexp used when parsing the index page.
     """
     self.db = CrawlDB()
     self.pat = Pattern()
     # self.patterns = config.PATTERNS
     self.titlere = re.compile(config.TITLEPATTERN)
     self.patterns = self.db.getPatterns() # unicode patterns
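
getPatterns() is not shown in this listing. From the way its records are indexed in Example #1 (pat["pid"], pat["pattern"], pat["name"]), each entry appears to be a dict of roughly this shape (the values below are invented placeholders):

# Shape inferred from usage elsewhere in the listing; values are placeholders.
patterns = [
    {"pid": 1, "pattern": u"<index-page keyword>", "name": u"<feed title>"},
]
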
Example #3
import urllib2

# settings and CrawlDB come from the surrounding project; their imports are
# not shown in the original snippet.


def crawl(start_url):
    cdb = CrawlDB(settings.DB_FILE)
    cdb.connect()
    cdb.enqueue([start_url])

    while True:
        url = cdb.dequeue()
        if url is False:        # dequeue() returns False when the queue is empty
            break
        if cdb.hasCrawled(url):
            continue
        print url

        status = 0
        req = urllib2.Request(str(url))
        req.add_header("User-Agent", "couchmap 0.1")

        request = None

        try:
            request = urllib2.urlopen(req)
        except urllib2.HTTPError, e:
            # HTTPError is a subclass of URLError, so it must be caught first;
            # in the original order this branch could never run
            status = e.code
        except urllib2.URLError, e:
            continue
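
CrawlDB itself is not shown in the listing; the loop above only relies on a small queue-style interface (connect, enqueue, dequeue, hasCrawled). A minimal sketch of that interface, assuming a SQLite-backed table; the schema, method bodies and the markCrawled name are illustrative, not the project's actual implementation:

import sqlite3

class CrawlQueueSketch(object):
    """Illustrative stand-in for the CrawlDB interface used above."""

    def __init__(self, path):
        self.path = path
        self.conn = None

    def connect(self):
        self.conn = sqlite3.connect(self.path)
        self.conn.execute("CREATE TABLE IF NOT EXISTS queue "
                          "(url TEXT PRIMARY KEY, crawled INTEGER DEFAULT 0)")

    def enqueue(self, urls):
        for url in urls:
            self.conn.execute("INSERT OR IGNORE INTO queue (url) VALUES (?)",
                              (url,))
        self.conn.commit()

    def dequeue(self):
        # returns False when nothing is left, matching the `url is False` check
        row = self.conn.execute(
            "SELECT url FROM queue WHERE crawled = 0 LIMIT 1").fetchone()
        return row[0] if row else False

    def hasCrawled(self, url):
        row = self.conn.execute(
            "SELECT crawled FROM queue WHERE url = ?", (url,)).fetchone()
        return bool(row and row[0])

    def markCrawled(self, url):
        # name invented for the sketch; the real class presumably records the
        # fetch result after the loop body
        self.conn.execute("UPDATE queue SET crawled = 1 WHERE url = ?", (url,))
        self.conn.commit()
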
Example #4
class CrawlPages:
    """
    Crawl hjbbs.com: log in, find new article links on the index page and
    fetch the matching pages.
    """
    def __init__(self):
        """
        Set up the database handle, the content filter and the compiled title
        regexp used when parsing the index page.
        """
        self.db = CrawlDB()
        self.pat = Pattern()
        # self.patterns = config.PATTERNS
        self.titlere = re.compile(config.TITLEPATTERN)
        self.patterns = self.db.getPatterns() # unicode patterns

    def parseTitles(self):
        """
        Fetch the index page first, then scan its content for all links that
        still need to be retrieved.
        @return list of dicts with "url" and "pid" keys for the pages to retrieve
        """
        logger.info("root url: " + config.ROOTURL)
        sock = urllib2.urlopen(config.ROOTURL)
        lines = sock.readlines()
        sock.close()

        if config.DEBUG:
            pdb.set_trace()

        logger.info("Index Content: %s" %
                    ("\n".join(lines)).decode("gbk"))

        prelines = []
        for line in lines:
            if len(line) > 10:      # trick, avoid useless matches
                for pat in self.patterns:
                    if line.find(pat["pattern"].encode("gbk")) != -1:
                        prelines.append({"line": line,
                                         "pid": pat["pid"]})

        logger.info("catched lines num: %d " % len(prelines))

        prelinks = []
        for line in prelines:
            mline = self.titlere.search(line["line"])
            if mline:
                # check database
                newurl = "http://www.hjbbs.com/"+mline.group(1)
                if config.DEBUG:
                    pdb.set_trace()

                if not self.db.chkUrl(newurl):
                    prelinks.append({"url": newurl,
                                     "pid": line["pid"]})
        logger.info("links to be crawled num: %d " % len(prelinks))
        return prelinks

    def loginHjbbs(self):
        """
        Log in to hjbbs and keep the session cookie.
        Call this before crawling any other pages.
        @return True if login succeeded, False otherwise
        """
        cookie_support = urllib2.HTTPCookieProcessor(cookielib.CookieJar())
        opener = urllib2.build_opener(cookie_support, urllib2.HTTPHandler)
        urllib2.install_opener(opener)

        tmpfile = "code.bmp"

        vcodebmp = urllib2.urlopen('http://hjbbs.com/GetCode.asp').read()
        vcodefile = open(tmpfile, 'wb')
        vcodefile.write(vcodebmp)
        vcodefile.close()

        vcodenum = getCode(tmpfile)   # read the verification code from the saved CAPTCHA image

        postdata = urllib.urlencode({
                'username': config.USERNAME,
                'password': config.PASSWORD,
                'comeurl': 'http://hjbbs.com/index.asp',
                'userhidden': '3',
                'submit': '登录',      # the form's submit label ("Log in"); must stay as-is
                'CookieDate': 3,
                'SecurityKey': vcodenum
                })
        postheaders = {"User-Agent":"Mozilla/5.0 (X11; U; Linux i686) Gecko/20071127 Firefox/2.0.0.11",
                       "Content-Type":"application/x-www-form-urlencoded",
                       "Referer":"http://hjbbs.com/login.asp",
                       "Connection":"keep-alive",
                       "Keep-Alive":115}

        req = urllib2.Request(
            url = "http://hjbbs.com/login.asp?action=chk",
            data = postdata,
            headers = postheaders
            )
        try:
            res = urllib2.urlopen(req)
        except urllib2.HTTPError, e:
            logger.error("loginHjbbs http failed: %s" % e.code)
            return False
        except urllib2.URLError, e:
            logger.error("loginHjbbs url failed: %s" % e.reason)
            return False
Example #5
        for o, a in opts:
            if o in ("-c", "--crawl"):
                fp = open("/tmp/tmp-page.txt", "wb")
                if crawl.loginHjbbs():
                    for link in crawl.parseTitles():
                        page = crawl.crawlPage(link)
                        fp.write(str(link) + "\r\n")   # link is a dict (url/pid); stringify it before dumping
                        fp.write(page["title"])
                        fp.write(page["content"])
                        print link
                else:
                    print "login failed"
            elif o in ("-m", "--mail"):
                from mail import SendMail

                db = CrawlDB()
                mail = SendMail()
                # search db
                pages = db.getPages()
                if pages:
                    for page in pages:
                        if mail.sendMail(page["title"], page["content"]):
                            db.setUrl(page["url"])
                else:
                    print "no mail is sent"
                mail.close()
            else:
                assert False, "unhandled option"
    else:
        # from time import strftime, gmtime
        # timefmt = strftime("%y-%m-%d", gmtime())
Example #6
                      "*****@*****.**",
                      "*****@*****.**"]
                title = u"test title here"
                content = """
Begin of test<br/>
<embed type="application/x-shockwave-flash" src="http://www.odeo.com/flash/audio_player_standard_gray.swf" width="400" height="52" allowScriptAccess="always" wmode="transparent" flashvars="external_url=http://172.29.7.127:8000/static/media/35130a760bfc7e5054526ce94c17004f.mp3" />
Another
 <embed src="http://172.29.7.127:8000/static/media/35130a760bfc7e5054526ce94c17004f.mp3" loop=false autostart=false name="IMG_English" width="300" height="20" /><br/>End of test
"""
                mail.sendMail(to, title, content.decode("utf-8"))
    else:
        # send web pages to mail Subscriber
        logger.info("Begin to send email ...")

        pat = Pattern()
        db = CrawlDB()
        pages = db.getPages()
        if pages:
            logger.debug("Fetched %s pages." % (len(pages)))
            for page in pages:
                addrlist = db.getEmailByPid(page["pid"])
                if addrlist:
                    logger.debug("send mail to %s persons..." % (len(addrlist)))
                    content = pat.sub(page["content"])

                    if mail.sendMail(addrlist,
                                     page["title"],
                                     content):
                        db.setUrl(page["url"])
                        logger.info("Page [%s] is sent to %s\n\n%s\n\n" %
                                    (page["title"], ",".join(addrlist),