import threading
import urllib2
import gzip
import StringIO

# UrlParser, HtmlParser, HtmlDB, ImageParser and TextFileParser are
# project-specific helpers imported from the crawler's own modules.


class Reptile(threading.Thread):
    """
    A single crawler thread.
    """

    def __init__(self, name, urlQueue, urlist, Flock, homeUrls, maxPageNums, pages, imagenum, continueRun=[True]):
        """
        pages:  records the number of pages downloaded so far
        """

        self.__name = name
        threading.Thread.__init__(self, name=name)
        # own data
        self.__homeUrls = homeUrls
        self.__urlist = urlist
        self.__urlQueue = urlQueue
        self.Flock = Flock
        self.__curSiteID = [0]  # curSiteID
        self.__temSiteID = -1
        self.__homeurl = None
        self.__pathinfo = None  # info for the url currently being processed
        self.continueRun = continueRun
        # information reported back to the UserFrame ----
        # per-site limits on how many pages may be downloaded
        self.__maxPageNums = maxPageNums
        # per-site counters of pages downloaded so far
        self.pages = pages
        self.imagenum = imagenum
        # ---------------------
        self.urlparser = UrlParser(homeUrls)
        self.htmlparser = HtmlParser(self.urlparser)
        self.htmldb = HtmlDB(self.htmlparser)
        self.imageparser = ImageParser(name)
        self.textfileparser = TextFileParser()

    def requestSource(self, url):
        request = urllib2.Request(url)
        request.add_header("Accept-encoding", "gzip")

        try:
            page = self.opener.open(request, timeout=2)  # 2 second timeout

            if page.code == 200:
                predata = page.read()
                pdata = StringIO.StringIO(predata)
                gzipper = gzip.GzipFile(fileobj=pdata)

                try:
                    # the body may not actually be gzipped; fall back to the raw bytes
                    data = gzipper.read()
                except IOError:
                    data = predata

                length = len(data)

                if length < 300 or length > 3000000:
                    return False
                # begin to parse the page
                return data

            page.close()
        except Exception:
            print "request failed or timed out"

    def underPageLimit(self):
        """
        是否 某个站点的收录页面超出限制
        """
        _type = self.urlparser.typeDetect(self.__pathinfo.url)[0]
        # 如果 type 为‘’ 表示网页  image/doc表文件
        if _type:
            # 对图片等文件不作计数
            return True

        if self.pages[self.__temSiteID] >= self.__maxPageNums[self.__temSiteID]:
            return False
        return True

    def run(self):
        """ 运行主陈需 """

        self.opener = urllib2.build_opener()

        while self.continueRun[0]:
            self.Flock.acquire()
            try:
                self.__pathinfo = self.__urlQueue.pop()
            except:
                print "nothing in urlqueue"
                print "dropped"
                return
            finally:
                # always release the lock, even when the queue is empty
                self.Flock.release()
            print ".. get pathinfo", self.__pathinfo.url, self.__name
            # get (siteID, (title, path))

            if not self.__pathinfo:
                """
                如果所有的队列均为空 则退出线程
                """
                print ".. get pathinfo empty"
                # return None
                break

            # self.__curSiteID[0] = pathinfo[0]
            self.__temSiteID = self.__pathinfo.siteID
            self.__temHomeUrl = self.__homeUrls[self.__temSiteID]

            # skip this url if its site has already reached the page limit
            if not self.underPageLimit():
                continue

            # print '.. curSite', self.__curSiteID[0]
            # print '.. homeurls', self.__homeUrls
            # print '.. get cursiteid', self.__curSiteID
            # print 'the path is ', pathinfo[1][1]
            source = self.requestSource(self.__pathinfo.url)

            # print source

            if not source:
                print "htmlsource is empty"
                continue

            filetype = self.urlparser.typeDetect(self.__pathinfo.url)
            _type = filetype[0]
            print ".. get file type", filetype, self.__name

            if not _type:
                self.dealHtml(source)
            elif _type == "image":
                self.dealImage(source, filetype[1])
                print "self.imagenum", self.imagenum
                self.imagenum[0] += 1
            elif _type == "doc":
                self.dealDoc()
                self.imagenum[0] += 1
            else:
                print "some unknown type..."

            # the source has been parsed, converted to an xml record and stored in the database
            # print '.. start to save html'

        # print '.. ',self.__name, 'quit!'

    def dealHtml(self, source):
        """
        对 html文件 从解析到存储的完整操作
        """
        print ".. get source len", len(source)
        # 过短视为无效
        if len(source) < 300:
            return
        # 判断是否为html源码
        if not self.htmlparser.init(source):
            print ".. source is not html"
            return
        # 开始进行处理
        # 从 urlqueue中取得的url 已经为 绝对地址
        self.pages[self.__temSiteID] += 1
        # 取得links srcs列表
        urlist = self.htmlparser.getLinks()
        urlist += self.htmlparser.getSrcs()
        # save html
        self.Flock.acquire()
        docID = self.htmldb.saveHtml(self.__pathinfo.siteID, self.__pathinfo.title, self.__pathinfo.url, source)
        self.Flock.release()

        self.addNewInQueue(docID, self.__pathinfo.url, urlist)

    def dealImage(self, source, extention):
        """
        对 image文件 从解析到存储的完整操作
        """
        try:
            self.imageparser.deal(source, extention, self.__pathinfo.url, self.__pathinfo.toDocID)
        except:
            return

    def dealDoc(self):
        """
        Complete handling of a doc file, from parsing to storage.
        """
        self.textfileparser.deal(self.__pathinfo.title, self.__pathinfo.url, self.__pathinfo.toDocID)

    def addNewInQueue(self, docID, pageStdUrl, urlist):
        """
        直接从html source中提取出path列表
        直接添加到各自的inqueue
        docID:  以及存储的page id
        urlist: html 及 文件地址混合列表
        """
        # images are processed as well: they too must be made
        # absolute and checked for duplicates
        # print 'get urlist'
        # for url in urlist:
        # print url[0], url[1]
        for urlInfor in urlist:
            # [title, path]
            # print 'pageStdUrl', pageStdUrl
            stdUrl = self.urlparser.transToStdUrl(pageStdUrl, urlInfor[1])
            # print '.. get STDURL', stdUrl
            siteID = self.urlparser.judgeUrl(pageStdUrl, urlInfor[1])
            _type = self.urlparser.typeDetect(stdUrl)[0]
            # print '.. get SITEID', siteID
            # path = self.urlparser.transPathByStd(stdUrl)
            # print '.. get PATH', path

            if siteID != -1:
                """
                加入队列中
                """
                # if not _type:
                # 正常网页
                if not self.__urlist.find(stdUrl):
                    """
                    urlist 中不重复
                    """
                    print ".. Add in Queue", stdUrl, _type

                    if not _type:
                        # html page
                        self.Flock.acquire()
                        # siteID toDocID urlinfo
                        self.__urlQueue.append(siteID, -1, (urlInfor[0], stdUrl))
                        self.Flock.release()
                    else:
                        # images and other files
                        self.Flock.acquire()
                        # siteID toDocID urlinfo
                        self.__urlQueue.append(siteID, docID, (urlInfor[0], stdUrl))
                        self.Flock.release()

                """