Example #1
 def __init__(self, urllist='urllist', invertedindex='invertedindex',
              htmls='htmls'):  # add the base URL to the set
     self.urllistName = os.path.join(os.path.dirname(__file__), urllist)
     self.htmlsName = os.path.join(os.path.dirname(__file__), htmls)
     self.indexbuilder = IndexBuilder(invertedindex)
     self.urls = []
     self.htmls = []
     self.length = []
Example #2
	def __init__(self,baseurl='http://www.cc98.org',urllist='urllist',queue='queue',invertedindex='invertedindex',graph='graph'):  # add the base URL to the set
		self.baseurl=baseurl
		self.queueName = queue
		self.urllistName = urllist
		self.graphName = graph
		if os.path.exists(self.urllistName) and os.path.exists(self.queueName) and os.path.exists(self.graphName):
			self.indexbuilder = IndexBuilder(invertedindex)
			self.fillset(self.urllistName, self.queueName, self.graphName)  # check whether to resume the previous crawl
		else:
			self.indexbuilder = IndexBuilder()
			pass
Example #3
class crawl:

    def __init__(self, urllist='urllist', invertedindex='invertedindex',
                 htmls='htmls'):  # add the base URL to the set
        self.urllistName = os.path.join(os.path.dirname(__file__), urllist)
        self.htmlsName = os.path.join(os.path.dirname(__file__), htmls)
        self.indexbuilder = IndexBuilder(invertedindex)
        self.urls = []
        self.htmls = []
        self.length = []

    def process(self):
        file_names = urllib2.urlopen("http://127.0.0.1:9000/index.txt").read().split()
        for file_name in file_names:
            try:
                url = "http://127.0.0.1:9000/{}".format(file_name)
                content = urllib2.urlopen(url).read()
                title = content[:content.find("\n")]
                self.urls.append(url)
                self.length.append(self.indexbuilder.process(content, title, len(self.urls) - 1))
                self.htmls.append([title, content])
            except urllib2.URLError as e:
                print(type(e), e.message, e.args)
                print(url)
            except socket.timeout as e:
                print(type(e), e.message, e.args)
                print(url)
            except Exception as e:
                print(e)
                print(url)

    def save(self):
        self.indexbuilder.save()
        json.dump(self.htmls, open(self.htmlsName, 'w'))

        with open(self.urllistName, 'w') as uu:
            uu.write('%d\n' % (len(self.urls)))
            i = 0
            print('Writing urllist back into file...')
            for item in self.urls:
                try:
                    uu.write('%d %s %d %d %d\n' % (i, item, 0, 0, self.length[i]))
                    i += 1
                except:
                    sys.stderr.write(
                        '%d %s\n' % (i, item) + '\n')
                    sys.stderr.write('urls output wrong\n')
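
A typical way to drive this crawler, assuming a local file server on port 9000 exposing index.txt as process() expects (this usage is not part of the original listing):

crawler = crawl()
crawler.process()  # fetch index.txt, then download and index every listed file
crawler.save()     # write the inverted index, the htmls dump, and the urllist file
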
Example #4
    def __init__(self):
        self.__invertedindex = IndexBuilder().index
        self.parser = Parser()
        self.parser.normalize('a')
        self.dictionary = None
        self.totallength = 0
        self.lave = 0
        self.roll_index = VocabTree()

        with open(os.path.join(os.path.dirname(__file__), 'urllist'),
                  'r') as f1:  # open the urllist file
            self.__urlnum = int(f1.readline())  # total number of URLs
            self.urllist = []
            n = 0

            while n < self.__urlnum:  # read each URL's info into the list
                s = f1.readline()
                arr = s.split(' ')
                # urlid = int(arr[0])          #url ID
                url = arr[1]  # URL string
                indegree = int(arr[2])  # URL in-degree, used for PageRank
                outdegree = int(arr[3])  # URL out-degree
                length_of_texts = int(arr[4])
                self.urllist.append(
                    [url, indegree, outdegree, length_of_texts])
                n = n + 1
                self.totallength += length_of_texts
        self.lave = self.totallength / self.__urlnum

        with open(os.path.join(os.path.dirname(__file__), 'htmls'),
                  'r') as file:
            self.htmls = json.load(file)
        # [
        #   [title, text],
        #   [title, text],
        # ]
        with open(os.path.join(os.path.dirname(__file__), 'dictionary'),
                  'r') as file:
            self.dictionary = json.load(file)
        # TODO: permuterm (rotated) index
        sys.stderr.write('Building roll index...')
        for word in self.dictionary:
            for i in range(len(word) + 1):
                self.roll_index.add_word(word[i:] + '$' + word[:i])
        sys.stderr.write('[Success]\n')
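
The loop at the end of Example #4 builds a permuterm (rotated) index: for each dictionary word it stores every rotation of the form word[i:] + '$' + word[:i], so a wildcard query X*Y can later be answered by a prefix lookup on Y$X (see wildcard2word in Example #8). A minimal self-contained sketch of the same idea, using a plain scan instead of the VocabTree prefix search (helper names are illustrative):

def rotations(word):
    # all rotations of word with a '$' end marker, e.g. 'cat' -> 'cat$', 'at$c', 't$ca', '$cat'
    return [word[i:] + '$' + word[:i] for i in range(len(word) + 1)]

def wildcard_matches(wildcard, vocabulary):
    # 'X*Y' matches any word starting with X and ending with Y:
    # rotate the query to 'Y$X' and look for it as a prefix of some rotation
    first, second = wildcard.split('*')
    prefix = second + '$' + first
    return sorted({word for word in vocabulary
                   if any(r.startswith(prefix) for r in rotations(word))})

# wildcard_matches('ca*t', ['cat', 'cart', 'car', 'coat']) returns ['cart', 'cat']
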
Example #5
    def __init__(self):
        self.__invertedindex = IndexBuilder().index
        self.pp = Parser()
        self.pp.normalize("a")
        self.pagerank = []
        with open("urllist", "r") as f1:  # open the urllist file
            self.__num1 = int(f1.readline())  # total number of URLs
            self.urllist = []
            n = 0

            while n < self.__num1:  # read each URL's info into the list
                s = f1.readline()
                arr = s.split(" ")
                # urlid = int(arr[0])          #url ID
                url = arr[1]  # URL string
                indegree = int(arr[2])  # URL in-degree, used for PageRank
                outdegree = int(arr[3])  # URL out-degree
                length_of_texts = int(arr[4])
                self.urllist.append([url, indegree, outdegree, length_of_texts])
                n = n + 1
        with open("pagerank", "r") as file:
            for line in file:
                self.pagerank.append(float(line))
Example #6
class searcher:
    # IndexBuilder().index
    # ... = Parser()
    # ....normalize(str) #['word','word'...]
    # constructor
    def __init__(self):
        self.__invertedindex = IndexBuilder().index
        self.pp = Parser()
        self.pp.normalize("a")
        self.pagerank = []
        with open("urllist", "r") as f1:  # open the urllist file
            self.__num1 = int(f1.readline())  # total number of URLs
            self.urllist = []
            n = 0

            while n < self.__num1:  # read each URL's info into the list
                s = f1.readline()
                arr = s.split(" ")
                # urlid = int(arr[0])          #url ID
                url = arr[1]  # URL string
                indegree = int(arr[2])  # URL in-degree, used for PageRank
                outdegree = int(arr[3])  # URL out-degree
                length_of_texts = int(arr[4])
                self.urllist.append([url, indegree, outdegree, length_of_texts])
                n = n + 1
        with open("pagerank", "r") as file:
            for line in file:
                self.pagerank.append(float(line))

    def search_cos(self, query, pagerank=True):
        querydict_tf = {}
        weight = {}
        scoredict = {}
        length = 0
        heap = []
        urlids = []
        self.querylist = self.pp.normalize(query)
        totaldoc = len(self.urllist)
        for item in self.querylist:
            if item in querydict_tf:
                querydict_tf[item] += 1
            else:
                querydict_tf[item] = 1
        for item in querydict_tf.iterkeys():
            if item in self.__invertedindex:
                weight[item] = (1.0 + math.log10(querydict_tf[item])) * math.log10(
                    1.0 * totaldoc / self.__invertedindex[item][0]
                )
            else:
                weight[item] = 0

        i = 0
        while i < self.__num1:
            score = 0

            for item in weight.iterkeys():
                if item in self.__invertedindex and str(i) in self.__invertedindex[item][1]:
                    score += weight[item] * self.__invertedindex[item][1][str(i)][1]
            if pagerank:
                score *= self.pagerank[i]
            uid = id_score(i, score)
            if uid.score > 0:
                if len(heap) <= 50:
                    heapq.heappush(heap, uid)
                else:
                    heapq.heappushpop(heap, uid)

            i += 1

        # output results
        while len(heap) > 0:
            tmp = heapq.heappop(heap).urlid
            urlids.append(tmp)
        urlids.reverse()
        return urlids

    # boolean search
    def boolean(self, query):
        query = self.pp.normalize(query)  # parse the query
        # character = []
        # for term in query:
        #     print type(term)
        #     query.append(term)
        character_set = list(set(query))  # deduplicate terms

        # sort terms by the size of their posting lists
        # character_set = []
        # for term in character:
        #     T = (term, len(self.__invertedindex[term][1]))
        #     character_set.append(T)
        # character_set.sort(lambda x, y: cmp(x[1], y[1]))

        # fetch the posting lists from the inverted index
        finalindex = self.__invertedindex.get(character_set[0], [0, {}, 0])[1].keys()  # postings of the first term
        for term in character_set:
            if finalindex:
                index = self.__invertedindex.get(term, [0, {}, 0])[1].keys()  # postings of the i-th term
                finalindex = list(set(finalindex) & set(index))
            else:
                return finalindex

        heap = []
        for url in finalindex:
            score = 0
            for term in character_set:
                score = score + self.__invertedindex.get(term, [0, {}, 0])[1][url][0]
            heap.append(id_score(int(url), score))
        heapq.heapify(heap)

        urlids = []
        while len(heap) > 0:
            tmp = heapq.heappop(heap).urlid
            urlids.append(tmp)
        urlids.reverse()
        return urlids

    def gettitle(self, url):
        try:
            req_header = {
                "User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6"
            }
            req = urllib2.Request(url, None, req_header)
            page = urllib2.urlopen(req, None, 54)
            html = page.read()
            page.close()
            soup = BeautifulSoup(html)
            title = soup.title
            title = title.string
        except Exception as e:
            print e
            title = None
        return title
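
search_cos above scores documents with tf-idf weights: each query term gets (1 + log10(tf)) * log10(N / df), and that weight is multiplied by the document-side weight stored in each posting. A small self-contained sketch of that scoring step, using plain dicts in place of the IndexBuilder postings (names are illustrative, not from the original project):

import math
from collections import Counter

def query_weights(query_terms, doc_freq, total_docs):
    # ltc query weighting: (1 + log10 tf) * log10(N / df), as in search_cos
    tf = Counter(query_terms)
    return {term: (1.0 + math.log10(count)) * math.log10(1.0 * total_docs / doc_freq[term])
            for term, count in tf.items() if doc_freq.get(term, 0) > 0}

def cosine_scores(weights, postings):
    # postings[term][doc_id] holds the pre-normalized document weight for that term
    scores = {}
    for term, w in weights.items():
        for doc_id, doc_weight in postings.get(term, {}).items():
            scores[doc_id] = scores.get(doc_id, 0.0) + w * doc_weight
    return sorted(scores.items(), key=lambda kv: kv[1], reverse=True)
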
Example #7
class crawl:
	baseurl=''
	req_header = {'User-Agent':'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
	req_timeout = 54
	urlqueue=[]
	urls=[]
	indegree=[]
	outdegree=[]
	length = []
	head=[]
	totalcount=0
	count=0
	read_web=set()
	graph = []
	
	def __init__(self,baseurl='http://www.cc98.org',urllist='urllist',queue='queue',invertedindex='invertedindex',graph='graph'):  # add the base URL to the set
		self.baseurl=baseurl
		self.queueName = queue
		self.urllistName = urllist
		self.graphName = graph
		if os.path.exists(self.urllistName) and os.path.exists(self.queueName) and os.path.exists(self.graphName):
			self.indexbuilder = IndexBuilder(invertedindex)
			self.fillset(self.urllistName, self.queueName, self.graphName)  # check whether to resume the previous crawl
		else:
			self.indexbuilder = IndexBuilder()
			pass
	
	def user_agent(self, loopnum):  # breadth-first traversal of pages
		if self.urlqueue:
			url_parent = self.urlqueue.pop(0)
			url = url_parent[0]
			parent = url_parent[1]
		else:
			url = self.baseurl
			parent = self.baseurl
		while self.count < loopnum:
			try:
				if(url in self.read_web):
					try:
						urlid = self.urls.index(url)
					except Exception as e:
						print e
					try:
						self.indegree[urlid]+=1
					except Exception as e:
						print e
					try:
						self.graph[urlid][self.urls.index(parent)] = 1
					except Exception as e:
						print e,e.args
				else:
					self.read_web.add(url)
					tmpoutdegree=0
					print("it's the %d time"%(self.count))
					self.count=self.count+1  # number of pages crawled
					req = urllib2.Request(url,None,self.req_header)
					page = urllib2.urlopen(req,None,self.req_timeout)
					html = page.read()
					page.close()
					soup = BeautifulSoup(html)			
					self.urls.append(url)
					self.graph.append([])
					for i in xrange(len(self.urls)-1): 
						self.graph[i].append(0)
						self.graph[len(self.urls)-1].append(0)
					self.graph[len(self.urls)-1].append(0)
					self.indegree.append(1)
					self.graph[len(self.graph)-1][self.urls.index(parent)] = 1

					self.length.append(self.indexbuilder.process(soup,len(self.urls)-1))

					a = soup.find_all(['a'])
					for i in a:
						suburl=i.get('href')
						if(suburl is not None and suburl.find('javascript')==-1):
							if(suburl.find('http')==-1):
								suburl=self.baseurl+'/'+suburl
							if(suburl.find('www.cc98.org')!=-1):
								# print(suburl)
								self.urlqueue.append([suburl,url])
								suburl=''
								tmpoutdegree=tmpoutdegree+1
					#c=raw_input()
					self.outdegree.append(tmpoutdegree)
					time.sleep(0.2)
			except urllib2.URLError as e:
				print type(e), e.message, e.args
				print url
			except socket.timeout as e:
				print type(e), e.message, e.args
				print url
			except Exception as e:
				print e
				print url
			if(len(self.urlqueue)>0):
				url_parent = self.urlqueue.pop(0)
				url = url_parent[0]
				parent = url_parent[1]
			# done crawling
	def save(self):
		self.indexbuilder.save()
		with open(self.queueName,'w') as qq:
			sys.stderr.write('Writing queue back into file...\n')
			for item in self.urlqueue:
				try:
					if(item is not None):
						qq.write(item[0]+' '+item[1]+'\n')
				except:
					sys.stderr.write('queue wrong but things well\n')
					pass
	
		with open(self.urllistName,'w') as uu:
			uu.write('%d\n'%(len(self.urls)))
			i=0
			print('Writing urllist back into file...')
			for item in self.urls:
				try:
					uu.write('%d %s %d %d %d\n'%(i, item, self.indegree[i], self.outdegree[i], self.length[i]))
					i+=1
				except:
					sys.stderr.write('%d %s %d %d %d\n'%(i, item, self.indegree[i], self.outdegree[i], self.length[i])+'\n')
					sys.stderr.write('urls output wrong\n')
					pass
			#return html
		with open(self.graphName,'w') as gg:
			print('Writing graph back into file...')
			try:
				json.dump(self.graph,gg,indent = 1)
			except Exception as e:
				sys.stderr.write(repr(e)+'\n')
				sys.stderr.write('Graph store error\n')

		with open('graph.txt','w') as file:
			print('Writing graph.txt back into file...')
			try:
				for i in xrange(len(self.graph)):
					for j in xrange(len(self.graph)):
						if self.graph[i][j] == 1:
							file.write(str(i)+' '+str(j)+' ')
					file.write('\n')
			except Exception as e:
				sys.stderr.write(repr(e)+'\n')
				sys.stderr.write('Graph.txt write error\n')

	def fillset(self,urllist,queue,graph):  # add previously visited sites to the set and rebuild the queue
		sys.stderr.write('Reload queue, urllist, graph...')
		with open(urllist,'r') as FILE:
			totalcount=FILE.readline()
			for item in FILE.readlines():
				try:
					(tmpid,tmpurl,tmpind,tmpoud,tmplen)=(item.strip('\n').split(' '))
					self.urls.append(tmpurl)
					self.read_web.add(tmpurl)
					self.indegree.append(int(tmpind))
					self.outdegree.append(int(tmpoud))
					self.length.append(int(tmplen))
				except Exception as e:
					sys.stderr.write(repr(e))
					sys.stderr.write('read in data error\n')

		with open(queue,'r') as FILE:
			for item in FILE.readlines():
				try:
					self.urlqueue.append(item.strip('\n').split(' '))
				except Exception as e:
					sys.stderr.write(repr(e))
					sys.stderr.write('read queue in error but well\n')

		with open(graph,'r') as FILE:
			try:
				self.graph = json.load(FILE)
			except Exception as e:
				sys.stderr.write(repr(e))
				sys.stderr.write('Read graph error\n')

		sys.stderr.write('[Success]\n')
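
The crawler in Example #7 records each page's in-degree, out-degree, and an adjacency matrix in which graph[i][j] == 1 means page j links to page i; the searcher examples then read one PageRank score per line from a pagerank file. How that file is produced is not shown in this listing; a minimal power-iteration sketch under those assumptions (damping factor and iteration count are illustrative):

def pagerank(graph, outdegree, damping=0.85, iterations=50):
    # graph[i][j] == 1 means page j links to page i; outdegree[j] counts page j's outgoing links
    n = len(graph)
    rank = [1.0 / n] * n
    for _ in range(iterations):
        new_rank = []
        for i in range(n):
            incoming = sum(rank[j] / outdegree[j]
                           for j in range(n) if graph[i][j] == 1 and outdegree[j] > 0)
            new_rank.append((1.0 - damping) / n + damping * incoming)
        rank = new_rank
    return rank

# one float per line, the format the searcher examples read back:
# with open('pagerank', 'w') as f:
#     for r in pagerank(graph, outdegree):
#         f.write('%f\n' % r)
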
Example #8
class searcher:
    # constructor
    def __init__(self):
        self.__invertedindex = IndexBuilder().index
        self.parser = Parser()
        self.parser.normalize('a')
        self.dictionary = None
        self.totallength = 0
        self.lave = 0
        self.roll_index = VocabTree()

        with open(os.path.join(os.path.dirname(__file__), 'urllist'),
                  'r') as f1:  # open the urllist file
            self.__urlnum = int(f1.readline())  # total number of URLs
            self.urllist = []
            n = 0

            while n < self.__urlnum:  # read each URL's info into the list
                s = f1.readline()
                arr = s.split(' ')
                # urlid = int(arr[0])          #url ID
                url = arr[1]  # URL string
                indegree = int(arr[2])  # URL in-degree, used for PageRank
                outdegree = int(arr[3])  # URL out-degree
                length_of_texts = int(arr[4])
                self.urllist.append(
                    [url, indegree, outdegree, length_of_texts])
                n = n + 1
                self.totallength += length_of_texts
        self.lave = self.totallength / self.__urlnum

        with open(os.path.join(os.path.dirname(__file__), 'htmls'),
                  'r') as file:
            self.htmls = json.load(file)
        # [
        #   [title, text],
        #   [title, text],
        # ]
        with open(os.path.join(os.path.dirname(__file__), 'dictionary'),
                  'r') as file:
            self.dictionary = json.load(file)
        # TODO: permuterm (rotated) index
        sys.stderr.write('Building roll index...')
        for word in self.dictionary:
            for i in range(len(word) + 1):
                self.roll_index.add_word(word[i:] + '$' + word[:i])
        sys.stderr.write('[Success]\n')

    def search_cos(self, query, k=50):
        querydict_tf = {}
        weight = {}
        scoredict = {}
        length = 0
        heap = []
        urlids = []
        self.querylist = self.parser.normalize(query)
        totaldoc = len(self.urllist)
        for item in self.querylist:
            if (item in querydict_tf):
                querydict_tf[item] += 1
            else:
                querydict_tf[item] = 1
        for item in querydict_tf.iterkeys():
            if (item in self.__invertedindex):
                weight[item] = (
                    1.0 + math.log10(querydict_tf[item])) * math.log10(
                        1.0 * totaldoc / self.__invertedindex[item][0])
            else:
                weight[item] = 0

        i = 0
        for i in range(self.__urlnum):
            score = 0

            for item in weight.iterkeys():
                if (item in self.__invertedindex
                        and str(i) in self.__invertedindex[item][1]):
                    score += weight[item] * self.__invertedindex[item][1][str(
                        i)][1]
            uid = id_score(i, score)
            if (uid.score > 0):
                heap.append(uid)

        # output results
        heapq.heapify(heap)
        for i in range(k):
            if heap:
                urlids.append(heapq.heappop(heap).urlid)
        return urlids

    def abstract(self, query, urlid):
        query_list = self.parser.normalize(query)
        result_list = []
        for item in query_list:
            index = self.htmls[urlid][1].lower().find(item)
            if index != -1:
                result_list.append([])
                start = -int(random.random() * 10)
                length = 15 - start * 2
                ll = len(item)
                if index >= -start:
                    i = start
                    a = 0
                    result_list[len(result_list) - 1].append('')
                    while i < start + length and index + i < len(
                            self.htmls[urlid][1]):
                        if i == 0:
                            a += 1
                            result_list[len(result_list) - 1].append('')
                        if i == ll:
                            a += 1
                            result_list[len(result_list) - 1].append('')
                        result_list[len(result_list) -
                                    1][a] += self.htmls[urlid][1][index + i]
                        i += 1
                    if i <= ll:
                        result_list[len(result_list) - 1].append('')
                else:
                    i = 0
                    a = 0
                    result_list[len(result_list) - 1].append('')
                    while i < length and index + i < len(self.htmls[urlid][1]):
                        if i == 0:
                            a = 0
                            result_list[len(result_list) - 1].append('')
                        if i == ll:
                            a = 0
                            result_list[len(result_list) - 1].append('')
                        result_list[len(result_list) -
                                    1][a] += self.htmls[urlid][1][index + i]
                        i += 1
                    if i <= ll:
                        result_list[len(result_list) - 1].append('')
        return result_list

    # boolean search
    def boolean(self, query, k=50):
        def query_to_tree(query):
            text2token = r'AND|OR|NOT|\w+|\(|\)'
            token2tag = {
                'AND': 'AND',
                'OR': 'OR',
                'NOT': 'NOT',
                '(': 'LP',
                ')': 'RP'
            }
            grammar = """
            exp -> orexp
            orexp -> orexp "OR" andexp
            orexp -> andexp
            andexp -> andexp "AND" notexp
            andexp -> andexp notexp
            andexp -> notexp
            notexp -> "NOT" metaexp
            notexp -> metaexp
            metaexp -> "LP" exp "RP"
            metaexp -> "#TERM#"
            """
            token = nltk.regexp_tokenize(query, text2token)
            tags = [token2tag.get(t, '#TERM#') for t in token]
            terms = [
                t for t in token if t not in ['AND', 'OR', 'NOT', '(', ')']
            ]
            parser = nltk.ChartParser(nltk.CFG.fromstring(grammar))
            for tree in parser.parse(tags):
                treestr = str(tree)
                for t in terms:
                    treestr = treestr.replace("#TERM#", t, 1)
                tree = nltk.Tree.fromstring(treestr)
            return tree

        def traversal(tree):
            def dict_or(id_score_dict1, id_score_dict2):
                rval = {}
                for key in set(id_score_dict1.keys()).union(
                        id_score_dict2.keys()):
                    rval[key] = id_score_dict1.get(
                        key, 0) + id_score_dict2.get(key, 0)
                return rval

            def dict_and(id_score_dict1, id_score_dict2):
                rval = {}
                for key in set(id_score_dict1.keys()).intersection(
                        id_score_dict2.keys()):
                    rval[key] = min(id_score_dict1.get(key),
                                    id_score_dict2.get(key))
                return rval

            def dict_not(id_score_dict):
                return {
                    url: 0
                    for url in {str(url)
                                for url in range(len(self.urllist))} -
                    set(id_score_dict.keys())
                }

            def word2dict(word):
                term = self.parser.stem(word)
                return {
                    urlid: tf_idf[0]
                    for urlid, tf_idf in self.__invertedindex.get(
                        term, [0, {}, 0])[1].iteritems()
                }

            if isinstance(tree, str) or isinstance(tree, unicode):
                return word2dict(tree)
            elif len(tree) == 1:
                return traversal(tree[0])
            elif tree.label() == 'orexp':
                assert tree[1] == 'OR'
                return dict_or(traversal(tree[0]), traversal(tree[2]))
            elif tree.label() == 'andexp':
                if tree[1] == 'AND':
                    return dict_and(traversal(tree[0]), traversal(tree[2]))
                else:
                    return dict_and(traversal(tree[0]), traversal(tree[1]))
            elif tree.label() == 'notexp':
                assert tree[0] == 'NOT'
                return dict_not(traversal(tree[1]))
            elif tree.label() == 'metaexp':
                assert tree[0] == 'LP'
                assert tree[2] == 'RP'
                return traversal(tree[1])

        if not self.parser.normalize(query):
            return []
        tree = query_to_tree(query)
        url_score_dict = traversal(tree)
        heap = []
        for url, score in url_score_dict.iteritems():
            heap.append(id_score(int(url), score))

        # finalindex = self.__invertedindex.get(character_set[0], [0, {}, 0])[1].keys()  # 获得第一个term的倒排文件索引
        # for term in character_set:
        #     if finalindex:
        #         index = self.__invertedindex.get(term, [0, {}, 0])[1].keys()  # 获得第i个term的倒排文件索引
        #         finalindex = list(set(finalindex) & set(index))
        #     else:
        #         return finalindex
        #
        # heap = []
        # for url in finalindex:
        #     score = 0
        #     for term in character_set:
        #         score = score + self.__invertedindex.get(term, [0, {}, 0])[1][url][0]
        #     heap.append(id_score(int(url), score))

        urlids = []
        heapq.heapify(heap)
        for i in range(k):
            if heap:
                urlids.append(heapq.heappop(heap).urlid)
        return urlids

    def search_rsv(self, query, k=50):
        k1 = 1.5
        k3 = 1.5
        b = 0.75
        querydict_tf = {}
        weight = {}
        scoredict = {}
        length = 0
        heap = []
        urlids = []
        self.querylist = self.parser.normalize(query)
        totaldoc = len(self.urllist)
        for item in self.querylist:
            if (item in querydict_tf):
                querydict_tf[item] += 1
            else:
                querydict_tf[item] = 1

        i = 0
        for i in range(self.__urlnum):
            score = 0
            for item in querydict_tf.iterkeys():
                if (item in self.__invertedindex
                        and str(i) in self.__invertedindex[item][1]):
                    score += math.log10(1.0 * self.__urlnum / self.__invertedindex[item][0]) * (k1 + 1) * \
                             self.__invertedindex[item][1][str(i)][0] / (
                             k1 * ((1 - b) + b * (1.0 * self.urllist[i][3] / self.lave)) +
                             self.__invertedindex[item][1][str(i)][0]) * (k3 + 1) * querydict_tf[item] / (
                             k3 + querydict_tf[item])
            uid = id_score(i, score)
            if (uid.score > 0):
                heap.append(uid)
                # if (len(heap) <= 50):
                #     heapq.heappush(heap, uid)
                # else:
                #     heapq.heappushpop(heap, uid)

        # 输出
        #     while len(heap) > 0:
        #         tmp = heapq.heappop(heap).urlid
        #         urlids.append(tmp)
        #     urlids.reverse()
        #     return urlids
        heapq.heapify(heap)
        for i in range(k):
            if heap:
                urlids.append(heapq.heappop(heap).urlid)
        return urlids

    def lm(
        self,
        query,
        k=50,
    ):
        querydict_tf = {}
        weight = {}
        scoredict = {}
        length = 0
        heap = []
        urlids = []
        lam = 0.8
        self.querylist = self.parser.normalize(query)
        totaldoc = len(self.urllist)
        for item in self.querylist:
            if (item in querydict_tf):
                querydict_tf[item] += 1
            else:
                querydict_tf[item] = 1
        for item in querydict_tf.iterkeys():
            if (item in self.__invertedindex):
                weight[item] = (
                    1.0 + math.log10(querydict_tf[item])) * math.log10(
                        1.0 * totaldoc / self.__invertedindex[item][0])
            else:
                weight[item] = 0

        i = 0
        for i in range(self.__urlnum):
            score = 1

            for item in weight.iterkeys():
                if (item in self.__invertedindex
                        and str(i) in self.__invertedindex[item][1]):
                    a = float(self.__invertedindex[item][1][str(i)]
                              [0]) / self.urllist[i][3]
                    b = float(self.__invertedindex[item][2]) / self.totallength
                    score *= (lam * a + (1 - lam) * b)**weight[item]
                else:
                    score = 0
            uid = id_score(i, score)
            if (uid.score > 0):
                heap.append(uid)

        # output results
        # while len(heap) > 0:
        #     tmp = heapq.heappop(heap).urlid
        #     urlids.append(tmp)
        # urlids.reverse()
        # return urlids
        heapq.heapify(heap)
        for i in range(k):
            if heap:
                urlids.append(heapq.heappop(heap).urlid)
        return urlids

    def gettitle(self, url):
        return u'in_gettitle'
        # try:
        #     req_header = {'User-Agent':'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
        #     req = urllib2.Request(url,None,req_header)
        #     page = urllib2.urlopen(req,None,54)
        #     html = page.read()
        #     page.close()
        #     soup = BeautifulSoup(html, 'lxml')
        #     title = soup.title
        #     title = title.string
        # except Exception as e:
        #     print e
        #     title = None
        # return title
    def word_correct(self, word):
        word = self.parser.normalize(word)
        word_list = []
        term = self.__invertedindex.keys()
        # todo correction
        return word_list

    def wildcard2word(self, wildcard):
        def derolled(word):
            assert '$' in word
            first, second = word.split('$')
            return second + first

        assert '*' in wildcard
        first, second = wildcard.split('*')
        rolled_word = second + '$' + first
        return map(derolled, self.roll_index.find_by_prefix(rolled_word))
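
search_rsv in Example #8 packs the Okapi BM25 scoring formula into a single expression. The same per-term contribution, restated as a standalone helper with the k1, k3, and b values used above (the function name is illustrative):

import math

def bm25_term_score(tf_doc, tf_query, df, total_docs, doc_len, avg_doc_len,
                    k1=1.5, k3=1.5, b=0.75):
    # idf * saturated document tf * saturated query tf, as accumulated in search_rsv
    idf = math.log10(1.0 * total_docs / df)
    doc_part = (k1 + 1.0) * tf_doc / (k1 * ((1.0 - b) + b * (1.0 * doc_len / avg_doc_len)) + tf_doc)
    query_part = (k3 + 1.0) * tf_query / (k3 + tf_query)
    return idf * doc_part * query_part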