Beispiel #1
0
	def __init__(self,baseurl='http://www.cc98.org',urllist='urllist',queue='queue',invertedindex='invertedindex',graph='graph'):
		"""Record the crawl configuration and pick how the index builder is created.

		The ``urllist``, ``queue`` and ``graph`` arguments are stored as file
		names. When all three files exist, the index builder is created from
		``invertedindex`` and ``fillset`` is called with the three names
		(per the original comment: to continue the previous crawl).
		Otherwise a default ``IndexBuilder()`` is created and nothing is loaded.
		"""
		self.baseurl = baseurl
		self.queueName = queue
		self.urllistName = urllist
		self.graphName = graph

		# Checked in the same order as before: urllist, queue, graph.
		state_files = (self.urllistName, self.queueName, self.graphName)
		if not all(os.path.exists(path) for path in state_files):
			# At least one state file is missing: start with a default builder.
			self.indexbuilder = IndexBuilder()
		else:
			self.indexbuilder = IndexBuilder(invertedindex)
			# Hand the saved state to fillset.
			self.fillset(self.urllistName, self.queueName, self.graphName)
Beispiel #2
0
 def __init__(self, urllist='urllist', invertedindex='invertedindex',
              htmls='htmls'):
     """Resolve the data file paths and start with empty collections.

     ``urllist`` and ``htmls`` are joined onto the directory containing this
     module; ``invertedindex`` is passed straight to ``IndexBuilder``.
     """
     module_dir = os.path.dirname(__file__)
     self.urllistName = os.path.join(module_dir, urllist)
     self.htmlsName = os.path.join(module_dir, htmls)
     self.indexbuilder = IndexBuilder(invertedindex)
     # Three separate, initially empty lists.
     self.urls, self.htmls, self.length = [], [], []
Beispiel #3
0
    def __init__(self):
        """Load the persisted index data and build the rotated-word index.

        Three files located next to this module are read:

        * ``urllist`` -- the first line is the number of records; each
          following line is split on single spaces, with field 1 the URL,
          field 2 the in-degree, field 3 the out-degree and field 4 the
          text length (field 0, the URL id, is ignored).
        * ``htmls`` -- JSON, stored unchanged in ``self.htmls``.
        * ``dictionary`` -- JSON, stored in ``self.dictionary`` and iterated
          as a collection of words.

        ``self.lave`` is the mean text length over all records. It stays at
        0 when ``urllist`` declares zero records (this used to raise
        ``ZeroDivisionError``).

        Raises:
            ValueError: if a numeric field in ``urllist`` is not an integer.
            IndexError: if a record line has fewer than five fields, e.g.
                when the file is shorter than its declared record count.
        """
        self.__invertedindex = IndexBuilder().index
        self.parser = Parser()
        # NOTE(review): the result is discarded; presumably this warms up the
        # parser before the first real query -- confirm.
        self.parser.normalize('a')
        self.dictionary = None
        self.totallength = 0
        self.lave = 0
        self.roll_index = VocabTree()

        base_dir = os.path.dirname(__file__)

        with open(os.path.join(base_dir, 'urllist'), 'r') as f1:
            self.__urlnum = int(f1.readline())  # total number of URLs
            self.urllist = []

            for _ in range(self.__urlnum):
                fields = f1.readline().split(' ')
                # fields[0] is the URL id; it is not used here.
                url = fields[1]
                indegree = int(fields[2])  # in-degree (original note: used for PageRank)
                outdegree = int(fields[3])  # out-degree
                length_of_texts = int(fields[4])
                self.urllist.append(
                    [url, indegree, outdegree, length_of_texts])
                self.totallength += length_of_texts

        # Guard the empty list: dividing by a zero record count would raise.
        if self.__urlnum:
            self.lave = self.totallength / self.__urlnum

        with open(os.path.join(base_dir, 'htmls'), 'r') as file:
            # Expected layout per the original comment:
            # [
            #   [title, text],
            #   [title, text],
            # ]
            self.htmls = json.load(file)

        with open(os.path.join(base_dir, 'dictionary'), 'r') as file:
            self.dictionary = json.load(file)

        # TODO: rotation ("roll") index
        sys.stderr.write('Building roll index...')
        for word in self.dictionary:
            # Insert every rotation of word + '$', from 'word$' to '$word'.
            for i in range(len(word) + 1):
                self.roll_index.add_word(word[i:] + '$' + word[:i])
        sys.stderr.write('[Success]\n')