def __init__(self, name, urlQueue, urlist, Flock, homeUrls, maxPageNums, pages, imagenum, continueRun=[True]):
    """
    Crawler-thread constructor (image-counting variant).

    name:        thread name (also handed to ImageParser)
    urlQueue:    shared queue the thread pops url records from
    urlist:      shared index of already-seen URLs
    Flock:       lock guarding the shared structures
    homeUrls:    list of crawled sites' home URLs
    maxPageNums: per-site limits on pages to collect
    pages:       shared counters of downloaded pages
                 (original note: "records the number of downloaded pages")
    imagenum:    shared one-element counter of downloaded images
    continueRun: shared one-element flag; external code sets it false to
                 stop the thread.  NOTE(review): mutable default argument --
                 every instance built without this argument shares one list.
    """
    self.__name = name
    threading.Thread.__init__(self, name=name)
    # own data
    self.__homeUrls = homeUrls
    self.__urlist = urlist
    self.__urlQueue = urlQueue
    self.Flock = Flock
    self.__curSiteID = [0]  # current site id (one-element list)
    self.__temSiteID = -1   # site id of the record being processed; -1 = none yet
    self.__homeurl = None
    self.__pageinfo = None
    self.continueRun = continueRun
    # some information to send to UserFrame ----
    # num of downloaded pages
    self.__maxPageNums = maxPageNums  # per-site page-count limits
    self.pages = pages
    self.imagenum = imagenum
    # ---------------------
    self.urlparser = UrlParser(homeUrls)
    self.htmlparser = HtmlParser(self.urlparser)
    self.htmldb = HtmlDB(self.htmlparser)
    self.imageparser = ImageParser(name)
    self.textfileparser = TextFileParser()
def __init__(self, name, urlQueue, urlist, urlInQueue, Flock, homeUrls ,curSiteID = [0], continueRun = [True]): threading.Thread.__init__(self, name = name ) #own data self.__homeUrls = homeUrls self.__urlist = urlist self.__urlQueue = urlQueue self.__urlInQueue = urlInQueue self.__Flock = Flock self.__curSiteID = curSiteID self.__temSiteID = -1 self.__conn = None self.__homeurl = None self.continueRun = continueRun #--------------------- self.urlparser = UrlParser(homeUrls) self.htmlparser = HtmlParser(self.urlparser) self.htmldb = HtmlDB(self.htmlparser)
def __init__(self, name, urlQueue, urlist, Flock, homeUrls, maxPageNums, pages, curSiteID=[0], continueRun=[True]):
    """
    Crawler-thread constructor (page-limit variant).

    pages:       shared counters of downloaded pages
                 (original note: "records the number of downloaded pages")
    maxPageNums: per-site limits on pages to collect
    curSiteID:   shared one-element list holding the current site id
                 (NOTE(review): accepted but not stored here -- confirm
                 against the variant that assigns self.__curSiteID)
    continueRun: shared one-element stop flag.  NOTE(review): mutable
                 default arguments are shared across instances.
    """
    self.__name = name
    threading.Thread.__init__(self, name=name)
    # own data
    self.__pages = pages
    self.__homeUrls = homeUrls
    self.__urlist = urlist
    self.__urlQueue = urlQueue
    self.Flock = Flock
    self.__homeurl = None
    self.continueRun = continueRun
    # some information to send to UserFrame ----
    # num of downloaded pages
    self.__maxPageNums = maxPageNums  # per-site page-count limits
    # ---------------------
    self.urlparser = UrlParser(homeUrls)
    self.htmlparser = HtmlParser(self.urlparser)
    self.htmldb = HtmlDB(self.htmlparser)
class Reptile(threading.Thread): ''' 单个线程 ''' def __init__(self, name, urlQueue, urlist, Flock, homeUrls, maxPageNums, pages, curSiteID = [0], continueRun = [True]): ''' pages: 记录下载的网页数目 ''' self.__name = name threading.Thread.__init__(self, name = name ) #own data self.__pages = pages self.__homeUrls = homeUrls self.__urlist = urlist self.__urlQueue = urlQueue self.Flock = Flock self.__homeurl = None self.continueRun = continueRun #some information to send to UserFrame ---- #num of downloaded pages self.__maxPageNums = maxPageNums #记录下载的页面数目 #--------------------- self.urlparser = UrlParser(homeUrls) self.htmlparser = HtmlParser(self.urlparser) self.htmldb = HtmlDB(self.htmlparser) def requestSource(self, path): conn = self.conn() print '.. conn',conn conn.request("GET", path) r1 = conn.getresponse() data = r1.read() #需要对data的返回转台进行解析 return data def getPage(self, path): print '>>path to load', path try: r = self.requestSource(path) except: r = None return r def run(self): while True : if not self.continueRun[0]: print self.__name,"stopped!" return #从temSiteID开始 print '.. temSiteID : ', self.__temSiteID pathinfo = self.__urlQueue.pop(self.__curSiteID[0]) #get (siteID, (title, url)) print '.. get pathinfo', pathinfo if not pathinfo: ''' 如果所有的队列均为空 则退出线程 ''' print '.. get pathinfo empty' #return None break self.__temHomeUrl = self.__homeUrls[self.__curSiteID[0]][1] #print '.. get cursiteid', self.__curSiteID #print 'the path is ', pathinfo[1][1] try: htmlsource = self.getPage(pathinfo[1][1]) except: print 'pathinfo bool' continue if not htmlsource: print 'htmlsource is wrong' continue print '.. get htmlsource len', len(htmlsource) #判断是否为html源码 if not self.htmlparser.init(htmlsource) : print '.. source is not html' continue #添加 path 到队列中 pageStdUrl = self.urlparser.transToStdUrl(self.__temHomeUrl, pathinfo[1][1]) self.addNewInQueue(pageStdUrl) #处理源码为xml文件 存储到数据库 print '.. 
start to save html' self.Flock.acquire() self.htmldb.saveHtml(self.__curSiteID[0], pathinfo[1][0], pageStdUrl, htmlsource) self.Flock.release() print '.. ',self.__name, 'quit!' def addNewInQueue(self, pageStdUrl): ''' 直接从html source中提取出path列表 直接添加到各自的inqueue ''' urlist = self.htmlparser.getLinks() print 'get urlist' for url in urlist: print url[0], url[1] for urlInfor in urlist: #[title, path] #print 'pageStdUrl', pageStdUrl stdUrl = self.urlparser.transToStdUrl(pageStdUrl, urlInfor[1]) #print '.. get STDURL', stdUrl siteId = self.urlparser.judgeUrl(pageStdUrl, urlInfor[1]) #print '.. get SITEID', siteId if siteId != -1 : ''' 加入队列中 ''' if not self.__urlist.find(stdUrl) : ''' urlist 中不重复 ''' print '.. Add in Queue', path self.Flock.acquire() self.__urlQueue.append(siteId, (urlInfor[0] ,stdUrl)) self.Flock.release()
class Reptile(threading.Thread):
    '''
    Single crawler thread (connection-caching variant).

    Pops path records from urlQueue, downloads them over a cached
    httplib connection, saves html and pushes newly found same-platform
    urls into urlInQueue.
    '''
    def __init__(self, name, urlQueue, urlist, urlInQueue, Flock, homeUrls, curSiteID=[0], continueRun=[True]):
        # NOTE(review): curSiteID / continueRun are mutable default
        # arguments shared by every instance built without them.
        threading.Thread.__init__(self, name=name)
        # own data
        self.__homeUrls = homeUrls
        self.__urlist = urlist
        self.__urlQueue = urlQueue
        self.__urlInQueue = urlInQueue
        self.__Flock = Flock
        self.__curSiteID = curSiteID
        self.__temSiteID = -1  # site id the cached connection belongs to
        self.__conn = None     # cached HTTP connection (built in conn())
        self.__homeurl = None
        self.continueRun = continueRun
        # ---------------------
        self.urlparser = UrlParser(homeUrls)
        self.htmlparser = HtmlParser(self.urlparser)
        self.htmldb = HtmlDB(self.htmlparser)

    @dec
    def conn(self):
        '''
        DNS cache: rebuild the HTTP connection only when the current
        site id changed since the previous call; otherwise reuse it.
        '''
        if self.__curSiteID[0] != self.__temSiteID:
            '''
            refresh the DNS / cached connection for the new site
            '''
            self.__temSiteID = self.__curSiteID[0]
            self.__homeurl = self.__homeUrls[self.__temSiteID]
            netloc = self.urlparser.transNetloc(self.__homeUrls[self.__temSiteID])
            print netloc
            self.__conn = httplib.HTTPConnection(netloc, 80, timeout = 10)
        return self.__conn

    @dec
    def requestSource(self, path):
        # GET *path* on the cached connection and return the raw body.
        conn = self.conn()
        conn.request("GET", path)
        r1 = conn.getresponse()
        data = r1.read()
        # TODO: the response status is not inspected yet (original note
        # said the returned state still needs parsing)
        return data

    @dec
    def getPage(self, path):
        # Thin wrapper kept for interface parity with other variants.
        return self.requestSource(path)

    @dec
    def run(self):
        '''
        Main loop: download queued records until externally stopped.
        '''
        self.conn()
        while(True):
            # externally controlled stop flag
            if not self.continueRun[0]:
                return
            newPathInfo = self.__urlQueue.get(timeout=5)
            pageStdUrl = self.urlparser.transToStdUrl(self.__homeurl, newPathInfo[1])
            print 'start download: ', pageStdUrl
            # newPathInfo is [title, path]
            source = self.getPage(newPathInfo[1])
            # skip anything that is not html source
            if not self.htmlparser.init(source):
                '''
                image or other non-html file
                '''
                continue
            # parse out links, then store the page
            self.addNewInQueue(pageStdUrl)
            self.htmldb.saveHtml(newPathInfo[0], pageStdUrl, source)

    @dec
    def addNewInQueue(self, pageStdUrl):
        '''
        Extract the path list from the html source and push each unseen
        same-platform url straight into its site's in-queue.
        '''
        urlist = self.htmlparser.getLinks()
        for urlInfor in urlist:
            # urlInfor is [title, path]
            stdUrl = self.urlparser.transToStdUrl(pageStdUrl, urlInfor[1])
            siteId = self.urlparser.judgeUrl(pageStdUrl, urlInfor[1])
            path = self.urlparser.transPathByStd(stdUrl)
            # keep only urls that belong to this platform
            if siteId != -1:
                if not self.__urlist.find(path):
                    self.__urlInQueue.put(siteId, urlInfor[1], path)
class Reptile(threading.Thread): """ 单个线程 """ def __init__(self, name, urlQueue, urlist, Flock, homeUrls, maxPageNums, pages, imagenum, continueRun=[True]): """ pages: 记录下载的网页数目 """ self.__name = name threading.Thread.__init__(self, name=name) # own data self.__homeUrls = homeUrls self.__urlist = urlist self.__urlQueue = urlQueue self.Flock = Flock self.__curSiteID = [0] # curSiteID self.__temSiteID = -1 self.__homeurl = None self.__pageinfo = None self.continueRun = continueRun # some information to send to UserFrame ---- # num of downloaded pages self.__maxPageNums = maxPageNums # 记录下载的页面数目 self.pages = pages self.imagenum = imagenum # --------------------- self.urlparser = UrlParser(homeUrls) self.htmlparser = HtmlParser(self.urlparser) self.htmldb = HtmlDB(self.htmlparser) self.imageparser = ImageParser(name) self.textfileparser = TextFileParser() def requestSource(self, url): request = urllib2.Request(url) request.add_header("Accept-encoding", "gzip") try: page = self.opener.open(request, timeout=2) # 设置超时为2s if page.code == 200: predata = page.read() pdata = StringIO.StringIO(predata) gzipper = gzip.GzipFile(fileobj=pdata) try: data = gzipper.read() except (IOError): data = predata length = len(data) if length < 300 or length > 3000000: return False # begain to parse the page return data page.close() except: print "time out" def underPageLimit(self): """ 是否 某个站点的收录页面超出限制 """ _type = self.urlparser.typeDetect(self.__pathinfo.url)[0] # 如果 type 为‘’ 表示网页 image/doc表文件 if _type: # 对图片等文件不作计数 return True if self.pages[self.__temSiteID] >= self.__maxPageNums[self.__temSiteID]: return False return True def run(self): """ 运行主陈需 """ self.opener = urllib2.build_opener() while self.continueRun[0]: try: self.Flock.acquire() self.__pathinfo = self.__urlQueue.pop() self.Flock.release() except: print "nothing in urlqueue" print "droped" return print ".. 
get pathinfo", self.__pathinfo.url, self.__name # get (siteID, (title, path)) if not self.__pathinfo: """ 如果所有的队列均为空 则退出线程 """ print ".. get pathinfo empty" # return None break # self.__curSiteID[0] = pathinfo[0] self.__temSiteID = self.__pathinfo.siteID self.__temHomeUrl = self.__homeUrls[self.__temSiteID] # 判断是否超过限制页数 if not self.underPageLimit(): continue # print '.. curSite', self.__curSiteID[0] # print '.. homeurls', self.__homeUrls # print '.. get cursiteid', self.__curSiteID # print 'the path is ', pathinfo[1][1] source = self.requestSource(self.__pathinfo.url) # print source if not source: print "htmlsource is empty" continue filetype = self.urlparser.typeDetect(self.__pathinfo.url) _type = filetype[0] print ".. get file type", filetype, self.__name if not _type: self.dealHtml(source) elif _type == "image": self.dealImage(source, filetype[1]) print "self.imagenum", self.imagenum self.imagenum[0] += 1 elif _type == "doc": self.dealDoc() self.imagenum[0] += 1 else: print "some unknown type..." # 处理源码为xml文件 存储到数据库 # print '.. start to save html' # print '.. ',self.__name, 'quit!' def dealHtml(self, source): """ 对 html文件 从解析到存储的完整操作 """ print ".. get source len", len(source) # 过短视为无效 if len(source) < 300: return # 判断是否为html源码 if not self.htmlparser.init(source): print ".. 
source is not html" return # 开始进行处理 # 从 urlqueue中取得的url 已经为 绝对地址 self.pages[self.__temSiteID] += 1 # 取得links srcs列表 urlist = self.htmlparser.getLinks() urlist += self.htmlparser.getSrcs() # save html self.Flock.acquire() docID = self.htmldb.saveHtml(self.__pathinfo.siteID, self.__pathinfo.title, self.__pathinfo.url, source) self.Flock.release() self.addNewInQueue(docID, self.__pathinfo.url, urlist) def dealImage(self, source, extention): """ 对 image文件 从解析到存储的完整操作 """ try: self.imageparser.deal(source, extention, self.__pathinfo.url, self.__pathinfo.toDocID) except: return def dealDoc(self): """ 对 doc文件 从解析到存储的完整操作 """ self.textfileparser.deal(self.__pathinfo.title, self.__pathinfo.url, self.__pathinfo.toDocID) def addNewInQueue(self, docID, pageStdUrl, urlist): """ 直接从html source中提取出path列表 直接添加到各自的inqueue docID: 以及存储的page id urlist: html 及 文件地址混合列表 """ # 连同图片进行处理 # 图片也需要进行绝对化和判断是否重复等操作 # print 'get urlist' # for url in urlist: # print url[0], url[1] for urlInfor in urlist: # [title, path] # print 'pageStdUrl', pageStdUrl stdUrl = self.urlparser.transToStdUrl(pageStdUrl, urlInfor[1]) # print '.. get STDURL', stdUrl siteID = self.urlparser.judgeUrl(pageStdUrl, urlInfor[1]) _type = self.urlparser.typeDetect(stdUrl)[0] # print '.. get SITEID', siteID # path = self.urlparser.transPathByStd(stdUrl) # print '.. get PATH', path if siteID != -1: """ 加入队列中 """ # if not _type: # 正常网页 if not self.__urlist.find(stdUrl): """ urlist 中不重复 """ print ".. Add in Queue", stdUrl, _type if not _type: # 网页 self.Flock.acquire() # siteID toDocID urlinfo self.__urlQueue.append(siteID, -1, (urlInfor[0], stdUrl)) self.Flock.release() else: # 图片 及 其他文件 self.Flock.acquire() # siteID toDocID urlinfo self.__urlQueue.append(siteID, docID, (urlInfor[0], stdUrl)) self.Flock.release() """