# -*- coding: utf-8 -*-
# Crawler variant for the local test server: fetches index.txt from
# http://127.0.0.1:9000 and indexes every file it lists.
from __future__ import print_function

import json
import os
import socket
import sys
import urllib2

from indexbuilder import IndexBuilder  # assumed local module providing the index builder


class crawl:
    def __init__(self, urllist='urllist', invertedindex='invertedindex', htmls='htmls'):
        # Resolve the output paths relative to this file and set up the index builder.
        self.urllistName = os.path.join(os.path.dirname(__file__), urllist)
        self.htmlsName = os.path.join(os.path.dirname(__file__), htmls)
        self.indexbuilder = IndexBuilder(invertedindex)
        self.urls = []
        self.htmls = []
        self.length = []

    def process(self):
        # index.txt lists one file name per line; fetch and index each file.
        file_names = urllib2.urlopen("http://127.0.0.1:9000/index.txt").read().split()
        for file_name in file_names:
            url = "http://127.0.0.1:9000/{}".format(file_name)
            try:
                content = urllib2.urlopen(url).read()
                title = content[:content.find("\n")]  # first line of the file is its title
                self.urls.append(url)
                self.length.append(self.indexbuilder.process(content, title, len(self.urls) - 1))
                self.htmls.append([title, content])
            except urllib2.URLError as e:
                print(type(e), e.args)
                print(url)
            except socket.timeout as e:
                print(type(e), e.args)
                print(url)
            except Exception as e:
                print(e)
                print(url)

    def save(self):
        self.indexbuilder.save()
        with open(self.htmlsName, 'w') as hh:
            json.dump(self.htmls, hh)
        with open(self.urllistName, 'w') as uu:
            uu.write('%d\n' % len(self.urls))
            print('Writing urllist back into file...')
            # Format: id url indegree outdegree length (degrees are unused in this variant).
            for i, item in enumerate(self.urls):
                try:
                    uu.write('%d %s %d %d %d\n' % (i, item, 0, 0, self.length[i]))
                except Exception:
                    sys.stderr.write('%d %s\n' % (i, item))
                    sys.stderr.write('urls output wrong\n')
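# A minimal driver sketch for the local variant (an assumption, not in the
# original source): it presumes a static file server is already running on
# port 9000, e.g. started from the document root with
#   python -m SimpleHTTPServer 9000
# and that index.txt lists the files to crawl.
if __name__ == '__main__':
    crawler = crawl()
    crawler.process()  # fetch and index every file named in index.txt
    crawler.save()     # persist the inverted index, html store, and urllist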
# -*- coding: utf-8 -*-
# Crawler variant for www.cc98.org: breadth-first crawl that also records the
# link graph with in/out degrees, and can resume from a previous run.
from __future__ import print_function

import json
import os
import socket
import sys
import time
import urllib2

from bs4 import BeautifulSoup
from indexbuilder import IndexBuilder  # assumed local module providing the index builder


class crawl:
    req_header = {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; '
                                'rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
    req_timeout = 54

    def __init__(self, baseurl='http://www.cc98.org', urllist='urllist',
                 queue='queue', invertedindex='invertedindex', graph='graph'):
        # Seed the crawl with the base URL. Mutable state lives on the
        # instance; class-level lists would be shared between instances.
        self.baseurl = baseurl
        self.queueName = queue
        self.urllistName = urllist
        self.graphName = graph
        self.urlqueue = []      # BFS frontier of [url, parent] pairs
        self.urls = []          # crawled URLs; list position is the page id
        self.indegree = []
        self.outdegree = []
        self.length = []
        self.head = []          # kept from the original; unused
        self.totalcount = 0
        self.count = 0          # pages crawled during this run
        self.read_web = set()   # URLs already visited
        self.graph = []         # adjacency matrix of the link graph
        if os.path.exists(self.urllistName) and os.path.exists(self.queueName) \
                and os.path.exists(self.graphName):
            # Resume the previous crawl from the saved state.
            self.indexbuilder = IndexBuilder(invertedindex)
            self.fillset(self.urllistName, self.queueName, self.graphName)
        else:
            self.indexbuilder = IndexBuilder()

    def user_agent(self, loopnum):
        # Breadth-first traversal of at most loopnum new pages.
        if self.urlqueue:
            url, parent = self.urlqueue.pop(0)
        else:
            url = self.baseurl
            parent = self.baseurl
        while self.count < loopnum:
            try:
                if url in self.read_web:
                    # Already visited: only update its indegree and the graph edge.
                    try:
                        urlid = self.urls.index(url)
                        self.indegree[urlid] += 1
                        self.graph[urlid][self.urls.index(parent)] = 1
                    except Exception as e:
                        print(e, e.args)
                else:
                    self.read_web.add(url)
                    tmpoutdegree = 0
                    print("it's the %d time" % self.count)
                    self.count += 1  # number of pages crawled
                    req = urllib2.Request(url, None, self.req_header)
                    page = urllib2.urlopen(req, None, self.req_timeout)
                    html = page.read()
                    page.close()
                    soup = BeautifulSoup(html, 'html.parser')
                    self.urls.append(url)
                    # Grow the adjacency matrix by one column and one row.
                    for row in self.graph:
                        row.append(0)
                    self.graph.append([0] * len(self.urls))
                    self.indegree.append(1)
                    self.graph[-1][self.urls.index(parent)] = 1
                    self.length.append(self.indexbuilder.process(soup, len(self.urls) - 1))
                    # Queue every in-site, non-javascript link found on the page.
                    for a in soup.find_all('a'):
                        suburl = a.get('href')
                        if suburl is not None and suburl.find('javascript') == -1:
                            if suburl.find('http') == -1:
                                suburl = self.baseurl + '/' + suburl
                            if suburl.find('www.cc98.org') != -1:
                                self.urlqueue.append([suburl, url])
                                tmpoutdegree += 1
                    self.outdegree.append(tmpoutdegree)
                    time.sleep(0.2)  # be polite to the server
            except urllib2.URLError as e:
                print(type(e), e.args)
                print(url)
            except socket.timeout as e:
                print(type(e), e.args)
                print(url)
            except Exception as e:
                print(e)
                print(url)
            if len(self.urlqueue) > 0:
                url, parent = self.urlqueue.pop(0)
            else:
                break  # frontier exhausted; the crawl is finished

    def save(self):
        self.indexbuilder.save()
        with open(self.queueName, 'w') as qq:
            sys.stderr.write('Writing queue back into file...\n')
            for item in self.urlqueue:
                try:
                    if item is not None:
                        qq.write(item[0] + ' ' + item[1] + '\n')
                except Exception:
                    sys.stderr.write('queue wrong but things well\n')
        with open(self.urllistName, 'w') as uu:
            uu.write('%d\n' % len(self.urls))
            print('Writing urllist back into file...')
            # Format: id url indegree outdegree length
            for i, item in enumerate(self.urls):
                try:
                    uu.write('%d %s %d %d %d\n' % (i, item, self.indegree[i],
                                                   self.outdegree[i], self.length[i]))
                except Exception:
                    sys.stderr.write('%d %s\n' % (i, item))
                    sys.stderr.write('urls output wrong\n')
        with open(self.graphName, 'w') as gg:
            print('Writing graph back into file...')
            try:
                json.dump(self.graph, gg, indent=1)
            except Exception as e:
                sys.stderr.write(repr(e) + '\n')
                sys.stderr.write('Graph store error\n')
        with open('graph.txt', 'w') as f:
            # Plain-text edge list: "i j" pairs, one source row per line.
            print('Writing graph.txt back into file...')
            try:
                for i in xrange(len(self.graph)):
                    for j in xrange(len(self.graph)):
                        if self.graph[i][j] == 1:
                            f.write(str(i) + ' ' + str(j) + ' ')
                    f.write('\n')
            except Exception as e:
                sys.stderr.write(repr(e) + '\n')
                sys.stderr.write('Graph.txt write error\n')

    def fillset(self, urllist, queue, graph):
        # Reload previously visited URLs into the set and re-fetch the queue
        # and graph, so the crawl can resume where it left off.
        sys.stderr.write('Reload queue, urllist, graph...')
        with open(urllist, 'r') as FILE:
            self.totalcount = int(FILE.readline())
            for item in FILE.readlines():
                try:
                    (tmpid, tmpurl, tmpind, tmpoud, tmplen) = item.strip('\n').split(' ')
                    self.urls.append(tmpurl)
                    self.read_web.add(tmpurl)
                    self.indegree.append(int(tmpind))
                    self.outdegree.append(int(tmpoud))
                    self.length.append(int(tmplen))
                except Exception as e:
                    sys.stderr.write(repr(e))
                    sys.stderr.write('read in data error\n')
        with open(queue, 'r') as FILE:
            for item in FILE.readlines():
                try:
                    self.urlqueue.append(item.strip('\n').split(' '))
                except Exception as e:
                    sys.stderr.write(repr(e))
                    sys.stderr.write('read queue in error but well\n')
        with open(graph, 'r') as FILE:
            try:
                self.graph = json.load(FILE)
            except Exception as e:
                sys.stderr.write(repr(e))
                sys.stderr.write('Read graph error\n')
        sys.stderr.write('[Success]\n')
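# A minimal driver sketch for the cc98 variant (an assumption, not in the
# original source): crawl a bounded number of pages, then persist everything
# so a later run can resume from the saved queue/urllist/graph files.
if __name__ == '__main__':
    crawler = crawl(baseurl='http://www.cc98.org')
    try:
        crawler.user_agent(100)  # crawl at most 100 new pages this run
    finally:
        crawler.save()           # always flush the index, queue, urllist, graph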