def __init__(self, baseurl='http://www.cc98.org', urllist='urllist',
             queue='queue', invertedindex='invertedindex', graph='graph'):
    """Record the crawl settings and pick up saved state when it exists.

    Args:
        baseurl: Root URL of the site; stored as ``self.baseurl``.
        urllist: Path of the URL-list file; stored as ``self.urllistName``.
        queue: Path of the queue file; stored as ``self.queueName``.
        invertedindex: Handed to ``IndexBuilder`` only when all three
            state files are present.
        graph: Path of the graph file; stored as ``self.graphName``.
    """
    self.baseurl = baseurl
    self.urllistName = urllist
    self.queueName = queue
    self.graphName = graph

    state_files = (self.urllistName, self.queueName, self.graphName)
    have_saved_state = all(os.path.exists(path) for path in state_files)

    if not have_saved_state:
        # At least one state file is missing: use a default IndexBuilder.
        self.indexbuilder = IndexBuilder()
    else:
        # All three files exist: reuse the named inverted index and let
        # fillset() read them (presumably resuming the previous crawl).
        self.indexbuilder = IndexBuilder(invertedindex)
        self.fillset(self.urllistName, self.queueName, self.graphName)
def __init__(self, urllist='urllist', invertedindex='invertedindex',
             htmls='htmls'):
    """Resolve the data-file paths and start with empty collections.

    Args:
        urllist: File name of the URL list, resolved against the directory
            containing this module and stored as ``self.urllistName``.
        invertedindex: Argument forwarded to ``IndexBuilder``.
        htmls: File name of the HTML store, resolved the same way and
            stored as ``self.htmlsName``.
    """
    module_dir = os.path.dirname(__file__)
    self.urllistName = os.path.join(module_dir, urllist)
    self.htmlsName = os.path.join(module_dir, htmls)

    self.indexbuilder = IndexBuilder(invertedindex)

    # Accumulators; all start out empty.
    self.urls, self.htmls, self.length = [], [], []
def __init__(self):
    """Load the on-disk index data and build the rotated-word index.

    Reads three files that sit next to this module:

    * ``urllist``: the first line is the number of URLs; each following
      line is space-separated as ``id url indegree outdegree text_length``.
    * ``htmls``: JSON, stored as ``self.htmls`` (per the original note, a
      list of ``[title, text]`` pairs).
    * ``dictionary``: JSON, an iterable of words.

    After loading, ``self.urllist`` holds one
    ``[url, indegree, outdegree, text_length]`` entry per URL,
    ``self.totallength`` is the sum of all text lengths and ``self.lave``
    is the average text length (left at 0 when there are no URLs).

    Raises:
        IOError/OSError: if one of the data files cannot be opened.
        ValueError: if a numeric field in ``urllist`` is not an integer.
        IndexError: if a ``urllist`` record has fewer than five fields.
    """
    data_dir = os.path.dirname(__file__)

    self.__invertedindex = IndexBuilder().index
    self.parser = Parser()
    # Result is discarded; presumably a warm-up call -- TODO confirm.
    self.parser.normalize('a')
    self.dictionary = None
    self.totallength = 0
    self.lave = 0
    self.roll_index = VocabTree()

    with open(os.path.join(data_dir, 'urllist'), 'r') as url_file:
        # Header line: total number of URL records that follow.
        self.__urlnum = int(url_file.readline())
        self.urllist = []
        for _ in range(self.__urlnum):
            fields = url_file.readline().split(' ')
            # fields[0] is the URL id; it is not needed here.
            url = fields[1]
            indegree = int(fields[2])   # in-degree, used for PageRank
            outdegree = int(fields[3])  # out-degree
            length_of_texts = int(fields[4])
            self.urllist.append(
                [url, indegree, outdegree, length_of_texts])
            self.totallength += length_of_texts

    # Guard against an empty URL list: dividing by zero would abort
    # construction, so the average simply stays at its initial 0.
    if self.__urlnum > 0:
        self.lave = self.totallength / self.__urlnum

    with open(os.path.join(data_dir, 'htmls'), 'r') as html_file:
        self.htmls = json.load(html_file)

    with open(os.path.join(data_dir, 'dictionary'), 'r') as dict_file:
        self.dictionary = json.load(dict_file)

    # TODO: roll (rotation) index.
    # Every rotation of word + '$' is inserted, so for "ab" the entries
    # are "ab$", "b$a" and "$ab".
    sys.stderr.write('Building roll index...')
    for word in self.dictionary:
        for split_at in range(len(word) + 1):
            self.roll_index.add_word(
                word[split_at:] + '$' + word[:split_at])
    sys.stderr.write('[Success]\n')