attach[position[i+1]]+=1 i+=1 else: notend=False else: cpl.append(phrase_list) return cpl if __name__ == '__main__': cc=correction() word=raw_input('输入纠正词: ').decode('utf-8') choose=input('输入选择:1.普通查找 2.精确查找\n') if choose==1: pp=Parser() phrase_list=pp.normalize(word) else: if choose==2: i=0 phrase_list=[] while i < len(word): phrase_list.append(word[i]) i+=1 '''for ii in phrase_list: print(ii.encode('utf-8'))''' print('________________before correct________________') ll=cc.correct(phrase_list) if len(ll)==0: print('no correct') else: for item in ll:
class IndexBuilder(object):
    """Builds and persists an inverted index.

    self.index maps each term to a posting list of the form
        [df, {urlid: [tf, normalized tf-idf]}, collection_tf]
    The index is loaded from, and saved back to, a JSON file whose first
    line holds the document count.
    """

    def __init__(self, invertedindex='invertedindex'):
        # invertedindex: path of the JSON file holding the persisted index.
        self.__p = Parser()
        self.index = {}
        self.__urlnum = 0
        self.__fileName = invertedindex
        try:
            with open(invertedindex, 'r') as fin:
                sys.stderr.write('Reading invertedindex in...')
                # First line is the number of documents; the rest is JSON.
                self.__urlnum = int(fin.readline())
                self.index = json.load(fin)
                sys.stderr.write('[Success]' + '\n')
        except IOError as err:
            # A missing/unreadable file is expected on first run: start empty.
            sys.stderr.write(str(type(err)) + str(err.args) + '\n')
            sys.stderr.write('Will rebuild the Inverted Index' + '\n')

    def __count_words(self, text, urlid):
        # Tokenize text and bump the tf counter of every token for urlid;
        # returns the number of tokens seen.
        length = 0
        for word in self.__p.normalize(text):
            records = self.index.setdefault(word, [0, {}, 0])[1]
            record = records.setdefault(urlid, [0, 0])
            record[0] = record[0] + 1
            length = length + 1
        return length

    def process(self, soup, urlid):
        """Add one parsed page (a BeautifulSoup object) to the index.

        Returns the page's token length. Title tokens are up-weighted by
        repeating the title five times.
        """
        urlid = str(urlid)  # JSON object keys are strings; keep ids consistent
        self.__urlnum = self.__urlnum + 1
        text = soup.get_text()
        length = 0
        if text:
            length += self.__count_words(text, urlid)
        try:
            title = soup.title.string
        except AttributeError:
            # Page has no <title> (soup.title is None). Was a bare except,
            # which also swallowed KeyboardInterrupt/SystemExit.
            title = None
        if title is not None:
            # Drop the trailing site suffix (16 chars) and weight title x5.
            title = unicode(title)[:-16] * 5
        if title:
            length += self.__count_words(title, urlid)
        return length

    def save(self):
        """Recompute tf-idf weights and write the index back to disk."""
        self.__calculateTf_idf()
        try:
            with open(self.__fileName, 'w') as fout:
                sys.stderr.write('Save back to "invertedindex"...')
                fout.write(str(self.__urlnum) + '\n')
                json.dump(self.index, fout, indent=1)
                sys.stderr.write('[Success]' + '\n')
        except IOError as err:
            sys.stderr.write(str(err) + '\n')
            sys.stderr.write('Can not write back to "invertedindex"!!!'
                             + '\n')
        return

    def __calculateTf_idf(self):
        """Fill in df, per-document cosine-normalized tf-idf, collection tf."""
        sys.stderr.write('Calculating...')
        # tf-idf = (1 + log(tf, base)) * log(N / df, base)
        base = 10
        for postingList in self.index.itervalues():
            postingList[0] = len(postingList[1])
            df = postingList[0]
            postingList[2] = 0
            for record in postingList[1].itervalues():
                tf = record[0]
                record[1] = (1 + math.log(tf, base)) * math.log(
                    self.__urlnum / float(df), base)
                postingList[2] = postingList[2] + tf
        # Cosine-normalize every document's tf-idf vector.
        for i in xrange(self.__urlnum):
            i = str(i)  # posting-list keys are string urlids
            length = 0
            for postingList in self.index.itervalues():
                records = postingList[1]
                tf_idf = records.get(i, [0, 0])[1]
                length = length + tf_idf * tf_idf
            length = math.sqrt(length)
            for postingList in self.index.itervalues():
                records = postingList[1]
                if i in records:
                    record = records[i]
                    record[1] = record[1] / length
        sys.stderr.write('[Success]' + '\n')
        return

    def __str__(self):
        import locale
        res = ''
        for key, val in self.index.items():
            res = res + key.encode(
                locale.getpreferredencoding()) + ' ' + str(
                    val[0]) + ' ' + '{'
            for i in xrange(self.__urlnum):
                # Posting-list keys are string urlids, so look up str(i).
                # (Previously the int i was used, which always returned the
                # default and printed empty postings.)
                res = res + str(i) + ':' + str(val[1].get(str(i), [0, 0.0]))
            res = res + '}' + '\n'
        return res

    def __repr__(self):
        return str(self)
class searcher:
    """Ranks urls against a query using the inverted index.

    Posting lists come from IndexBuilder().index:
        term -> [df, {urlid: [tf, tf-idf]}, collection_tf]
    """

    def __init__(self):
        self.__invertedindex = IndexBuilder().index
        self.pp = Parser()
        self.pp.normalize("a")  # warm up the parser/segmenter
        self.pagerank = []
        with open("urllist", "r") as f1:  # open the urllist file
            self.__num1 = int(f1.readline())  # total number of urls
            self.urllist = []
            n = 0
            while n < self.__num1:  # read per-url metadata
                s = f1.readline()
                arr = s.split(" ")
                # arr[0] is the url id (implicit by row position)
                url = arr[1]                     # url address
                indegree = int(arr[2])           # in-degree, used by PageRank
                outdegree = int(arr[3])          # out-degree
                length_of_texts = int(arr[4])
                self.urllist.append([url, indegree, outdegree, length_of_texts])
                n = n + 1
        with open("pagerank", "r") as file:
            for line in file:
                self.pagerank.append(float(line))

    def search_cos(self, query, pagerank=True):
        """Cosine-similarity ranking; returns up to 50 urlids, best first.

        When pagerank is True, each cosine score is multiplied by the
        page's PageRank value.
        """
        querydict_tf = {}
        weight = {}
        heap = []
        urlids = []
        self.querylist = self.pp.normalize(query)
        totaldoc = len(self.urllist)
        for item in self.querylist:
            if item in querydict_tf:
                querydict_tf[item] += 1
            else:
                querydict_tf[item] = 1
        for item in querydict_tf.iterkeys():
            if item in self.__invertedindex:
                # ltc query weighting: (1 + log tf) * idf
                weight[item] = (1.0 + math.log10(querydict_tf[item])) * math.log10(
                    1.0 * totaldoc / self.__invertedindex[item][0])
            else:
                weight[item] = 0
        i = 0
        while i < self.__num1:
            score = 0
            for item in weight.iterkeys():
                if item in self.__invertedindex and str(i) in self.__invertedindex[item][1]:
                    score += weight[item] * self.__invertedindex[item][1][str(i)][1]
            if pagerank:
                score *= self.pagerank[i]
            uid = id_score(i, score)
            if uid.score > 0:
                # Keep only the best 50 candidates on the heap.
                # (Was `<= 50`, which retained 51 results.)
                if len(heap) < 50:
                    heapq.heappush(heap, uid)
                else:
                    heapq.heappushpop(heap, uid)
            i += 1
        # Drain the heap (smallest first) and reverse for best-first order.
        while len(heap) > 0:
            urlids.append(heapq.heappop(heap).urlid)
        urlids.reverse()
        return urlids

    def boolean(self, query):
        """AND-semantics boolean search; scores hits by summed tf."""
        query = self.pp.normalize(query)  # parse the query
        character_set = list(set(query))  # de-duplicate terms
        if not character_set:
            # Empty/stopword-only query: previously character_set[0]
            # raised IndexError here.
            return []
        # Start from the first term's postings and intersect the rest.
        finalindex = self.__invertedindex.get(character_set[0], [0, {}, 0])[1].keys()
        for term in character_set:
            if finalindex:
                index = self.__invertedindex.get(term, [0, {}, 0])[1].keys()
                finalindex = list(set(finalindex) & set(index))
            else:
                return finalindex
        heap = []
        for url in finalindex:
            score = 0
            for term in character_set:
                score = score + self.__invertedindex.get(term, [0, {}, 0])[1][url][0]
            heap.append(id_score(int(url), score))
        heapq.heapify(heap)
        urlids = []
        while len(heap) > 0:
            urlids.append(heapq.heappop(heap).urlid)
        urlids.reverse()
        return urlids


def gettitle(url):
    """Fetch url and return its <title> string, or None on any failure."""
    try:
        req_header = {
            "User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6"
        }
        req = urllib2.Request(url, None, req_header)
        page = urllib2.urlopen(req, None, 54)
        html = page.read()
        page.close()
        soup = BeautifulSoup(html)
        title = soup.title
        title = title.string
    except Exception as e:
        print(e)
        title = None
    return title
attach[position[i + 1]] += 1 i += 1 else: notend = False else: cpl.append(phrase_list) return cpl if __name__ == '__main__': cc = correction() word = raw_input('输入纠正词: ').decode('utf-8') choose = input('输入选择:1.普通查找 2.精确查找\n') if choose == 1: pp = Parser() phrase_list = pp.normalize(word) else: if choose == 2: i = 0 phrase_list = [] while i < len(word): phrase_list.append(word[i]) i += 1 '''for ii in phrase_list: print(ii.encode('utf-8'))''' print('________________before correct________________') ll = cc.correct(phrase_list) if len(ll) == 0: print('no correct') else: for item in ll:
class correction:
    """Suggests spelling corrections for Chinese queries via pinyin lookup."""

    def __init__(self):
        self.pa = Parser()
        self.pp = PinYin()
        self.pp.load_word()
        # pinyin_dict: one JSON line mapping a pinyin string to a list of
        # (phrase, frequency) candidates.
        with open(os.path.join(os.path.dirname(__file__), 'pinyin_dict'), 'r') as ff:
            self.jj_dict = json.loads(ff.readline())
        # (removed a redundant ff.close(): the with-block already closes it)

    def correct(self, word, choose):
        """Return up to 5 correction strings for word, best first.

        choose == 1: segment word with the Parser;
        choose == 2: treat every character as its own phrase.
        The original query string itself is never suggested.
        """
        if choose == 1:
            phrase_list = self.pa.normalize(word)
        elif choose == 2:
            phrase_list = []
            i = 0
            while i < len(word):
                phrase_list.append(word[i])
                i += 1
        termlist = []
        newplist = self.recompose(phrase_list)
        for nnlist in newplist:
            i = 0
            tmp_correct = []   # per-position candidate (phrase, freq) lists
            correct_num = []   # per-position index of the current candidate
            for item in nnlist:
                py = self.pp.hanzi2pinyin_split(item, '_')
                tmp_correct.append([])
                if py in self.jj_dict:
                    for item2 in self.jj_dict[py]:
                        tmp_correct[i].append(item2)
                else:
                    # Unknown pinyin: keep the original phrase with freq 1.
                    tmp_correct[i].append((item, 1))
                correct_num.append(0)
                i += 1
            length = len(tmp_correct)
            # Enumerate the cartesian product of candidates, odometer-style.
            notend = True
            while notend:
                i = 0
                tmpstr = ''
                score = 0
                for j in xrange(0, length):
                    tmps = tmp_correct[j][correct_num[j]][0]
                    tmpstr += tmps
                    # Longer matched phrases dominate via the **7 weight.
                    score += int(tmp_correct[j][correct_num[j]][1]) * len(tmps) ** 7
                termlist.append(tup(tmpstr, score))
                correct_num[0] += 1
                while correct_num[i] >= len(tmp_correct[i]):
                    correct_num[i] = 0
                    if i < length - 1:
                        correct_num[i + 1] += 1
                        i += 1
                    else:
                        notend = False
        result_list = self.sscore(termlist)
        comstr = ''
        for item in phrase_list:
            comstr += item
        # Never suggest the query itself.
        if result_list[0] == comstr:
            result_list = []
        elif comstr in result_list:
            result_list.pop(result_list.index(comstr))
        return result_list

    def sscore(self, termlist):
        """Return the terms of the 5 highest-scoring tup items, best first."""
        heap = list(termlist)
        heapq.heapify(heap)
        # Discard the low scores until only the top 5 remain.
        while len(heap) > 5:
            heapq.heappop(heap)
        result_list = []
        while len(heap) > 0:
            result_list.append(heapq.heappop(heap).term)
        result_list.reverse()
        return result_list

    def recompose(self, phrase_list):
        """Enumerate ways of merging single characters into neighbours.

        Every 1-character phrase can merge left (state 0), merge right
        (state 1) or stay alone (state 2); the attach odometer walks all
        state combinations and the distinct resulting phrase lists are
        returned (list of lists).
        """
        position = []   # indices of single-character phrases
        attach = {}     # index -> merge state (0 left, 1 right, 2 keep)
        cpl = []        # the result: list of phrase lists
        i = 0
        for item in phrase_list:
            if len(item) == 1:
                position.append(i)
                attach[i] = 0
            i += 1
        notend = True
        length = len(position)
        if length > 0:
            while notend:
                gap = 0  # how many phrases have been merged away so far
                tmp_list = copy.deepcopy(phrase_list)
                tmp_position = copy.deepcopy(position)
                pi = 0
                while pi < len(tmp_position):
                    item2 = tmp_position[pi]
                    if attach[item2] == 0:
                        # Merge this character into the phrase on its left.
                        if item2 - 1 - gap >= 0:
                            tmp_list[item2 - 1 - gap] += tmp_list[item2 - gap]
                            k = tmp_position.index(item2)
                            tmp_position.pop(k)
                            tmp_list.pop(item2 - gap)
                            gap += 1
                        else:
                            pi += 1
                    elif attach[item2] == 1:
                        # Merge this character into the phrase on its right.
                        if item2 + 1 - gap < len(tmp_list):
                            tmp_list[item2 + 1 - gap] = tmp_list[item2 - gap] + tmp_list[item2 + 1 - gap]
                            k = tmp_position.index(item2)
                            tmp_position.pop(k)
                            tmp_list.pop(item2 - gap)
                            gap += 1
                            # The right neighbour (if itself single) was
                            # consumed too; drop it from the worklist.
                            if item2 + 1 in tmp_position:
                                tmp_position.pop(tmp_position.index(item2 + 1))
                            else:
                                pi += 1
                        else:
                            pi += 1
                    else:
                        pi += 1
                if tmp_list not in cpl:
                    cpl.append(tmp_list)
                # Advance the attach odometer: one state change per round.
                attach[position[0]] += 1
                i = 0
                while attach[position[i]] >= 3:
                    attach[position[i]] = 0
                    if i < length - 1:
                        attach[position[i + 1]] += 1
                        i += 1
                    else:
                        notend = False
        else:
            # No single characters: the input is the only variant.
            cpl.append(phrase_list)
        return cpl
class searcher:
    """Search front-end over the inverted index.

    Provides cosine, boolean (AND/OR/NOT), BM25 (RSV) and language-model
    ranking, snippet extraction, and wildcard expansion via a rolled
    (rotated) term index.
    """

    def __init__(self):
        self.__invertedindex = IndexBuilder().index
        self.parser = Parser()
        self.parser.normalize('a')  # warm up the segmenter
        self.dictionary = None
        self.totallength = 0
        self.lave = 0
        self.roll_index = VocabTree()
        with open(os.path.join(os.path.dirname(__file__), 'urllist'), 'r') as f1:  # open urllist
            self.__urlnum = int(f1.readline())  # total number of urls
            self.urllist = []
            n = 0
            while n < self.__urlnum:  # read per-url metadata
                s = f1.readline()
                arr = s.split(' ')
                # arr[0] is the url id (implicit by row position)
                url = arr[1]                     # url address
                indegree = int(arr[2])           # in-degree (PageRank)
                outdegree = int(arr[3])          # out-degree
                length_of_texts = int(arr[4])
                self.urllist.append(
                    [url, indegree, outdegree, length_of_texts])
                n = n + 1
                self.totallength += length_of_texts
            # Average document length. Float division: the original integer
            # division truncated lave and skewed BM25 length normalization.
            self.lave = float(self.totallength) / self.__urlnum
        with open(os.path.join(os.path.dirname(__file__), 'htmls'), 'r') as file:
            self.htmls = json.load(file)  # [[title, text], ...] per urlid
        with open(os.path.join(os.path.dirname(__file__), 'dictionary'), 'r') as file:
            self.dictionary = json.load(file)
        # Rolled index: every rotation word[i:] + '$' + word[:i] is added,
        # so a wildcard a*b becomes the prefix query b$a.
        sys.stderr.write('Building roll index...')
        for word in self.dictionary:
            for i in range(len(word) + 1):
                self.roll_index.add_word(word[i:] + '$' + word[:i])
        sys.stderr.write('[Success]\n')

    def search_cos(self, query, k=50):
        """tf-idf cosine ranking; returns up to k urlids, best first."""
        querydict_tf = {}
        weight = {}
        heap = []
        urlids = []
        self.querylist = self.parser.normalize(query)
        totaldoc = len(self.urllist)
        for item in self.querylist:
            if item in querydict_tf:
                querydict_tf[item] += 1
            else:
                querydict_tf[item] = 1
        for item in querydict_tf.iterkeys():
            if item in self.__invertedindex:
                # ltc query weighting: (1 + log tf) * idf
                weight[item] = (
                    1.0 + math.log10(querydict_tf[item])) * math.log10(
                        1.0 * totaldoc / self.__invertedindex[item][0])
            else:
                weight[item] = 0
        for i in range(self.__urlnum):
            score = 0
            for item in weight.iterkeys():
                if (item in self.__invertedindex
                        and str(i) in self.__invertedindex[item][1]):
                    score += weight[item] * self.__invertedindex[item][1][str(i)][1]
            uid = id_score(i, score)
            if uid.score > 0:
                heap.append(uid)
        heapq.heapify(heap)
        for i in range(k):
            if heap:
                urlids.append(heapq.heappop(heap).urlid)
        return urlids

    def abstract(self, query, urlid):
        """Build highlighted snippets around each query term found in the
        text of urlid.

        Each snippet is a list of alternating context/highlight fragments;
        the left-context size is randomized slightly for variety.
        """
        query_list = self.parser.normalize(query)
        result_list = []
        for item in query_list:
            index = self.htmls[urlid][1].lower().find(item)
            if index != -1:
                result_list.append([])
                start = -int(random.random() * 10)  # random left context
                length = 15 - start * 2
                ll = len(item)
                if index >= -start:
                    i = start
                    a = 0
                    result_list[len(result_list) - 1].append('')
                    while i < start + length and index + i < len(
                            self.htmls[urlid][1]):
                        if i == 0:
                            a += 1
                            result_list[len(result_list) - 1].append('')
                        if i == ll:
                            a += 1
                            result_list[len(result_list) - 1].append('')
                        result_list[len(result_list) - 1][a] += self.htmls[urlid][1][index + i]
                        i += 1
                    if i <= ll:
                        result_list[len(result_list) - 1].append('')
                else:
                    # Term too close to the start of the text for left context.
                    # NOTE(review): this branch resets a = 0 where the branch
                    # above increments it; looks suspicious — confirm the
                    # intended fragment layout before changing it.
                    i = 0
                    a = 0
                    result_list[len(result_list) - 1].append('')
                    while i < length and index + i < len(self.htmls[urlid][1]):
                        if i == 0:
                            a = 0
                            result_list[len(result_list) - 1].append('')
                        if i == ll:
                            a = 0
                            result_list[len(result_list) - 1].append('')
                        result_list[len(result_list) - 1][a] += self.htmls[urlid][1][index + i]
                        i += 1
                    if i <= ll:
                        result_list[len(result_list) - 1].append('')
        return result_list

    def boolean(self, query, k=50):
        """Boolean retrieval supporting AND / OR / NOT and parentheses.

        The query is parsed with an NLTK CFG into a tree, then evaluated
        bottom-up into a {urlid: score} dict; returns up to k urlids.
        """

        def query_to_tree(query):
            # Tokenize, replace each term with the #TERM# tag, parse the tag
            # sequence, then substitute the real terms back into the tree.
            text2token = r'AND|OR|NOT|\w+|\(|\)'
            token2tag = {
                'AND': 'AND',
                'OR': 'OR',
                'NOT': 'NOT',
                '(': 'LP',
                ')': 'RP'
            }
            grammar = """
            exp -> orexp
            orexp -> orexp "OR" andexp
            orexp -> andexp
            andexp -> andexp "AND" notexp
            andexp -> andexp notexp
            andexp -> notexp
            notexp -> "NOT" metaexp
            notexp -> metaexp
            metaexp -> "LP" exp "RP"
            metaexp -> "#TERM#"
            """
            token = nltk.regexp_tokenize(query, text2token)
            tags = [token2tag.get(t, '#TERM#') for t in token]
            terms = [
                t for t in token if t not in ['AND', 'OR', 'NOT', '(', ')']
            ]
            parser = nltk.ChartParser(nltk.CFG.fromstring(grammar))
            for tree in parser.parse(tags):
                treestr = str(tree)
                for t in terms:
                    treestr = treestr.replace("#TERM#", t, 1)
                tree = nltk.Tree.fromstring(treestr)
            return tree

        def traversal(tree):
            # Evaluate the parse tree into a {urlid: score} dict.

            def dict_or(d1, d2):
                # Union of postings; scores add.
                rval = {}
                for key in set(d1.keys()).union(d2.keys()):
                    rval[key] = d1.get(key, 0) + d2.get(key, 0)
                return rval

            def dict_and(d1, d2):
                # Intersection of postings; score is the minimum.
                rval = {}
                for key in set(d1.keys()).intersection(d2.keys()):
                    rval[key] = min(d1.get(key), d2.get(key))
                return rval

            def dict_not(d):
                # Complement over all known urls, with score 0.
                return {
                    url: 0
                    for url in {str(u)
                                for u in range(len(self.urllist))} -
                    set(d.keys())
                }

            def word2dict(word):
                term = self.parser.stem(word)
                return {
                    urlid: tf_idf[0]
                    for urlid, tf_idf in self.__invertedindex.get(
                        term, [0, {}, 0])[1].iteritems()
                }

            if isinstance(tree, str) or isinstance(tree, unicode):
                return word2dict(tree)
            elif len(tree) == 1:
                return traversal(tree[0])
            elif tree.label() == 'orexp':
                assert tree[1] == 'OR'
                return dict_or(traversal(tree[0]), traversal(tree[2]))
            elif tree.label() == 'andexp':
                if tree[1] == 'AND':
                    return dict_and(traversal(tree[0]), traversal(tree[2]))
                else:
                    return dict_and(traversal(tree[0]), traversal(tree[1]))
            elif tree.label() == 'notexp':
                assert tree[0] == 'NOT'
                return dict_not(traversal(tree[1]))
            elif tree.label() == 'metaexp':
                assert tree[0] == 'LP'
                assert tree[2] == 'RP'
                return traversal(tree[1])

        if not self.parser.normalize(query):
            return []
        tree = query_to_tree(query)
        url_score_dict = traversal(tree)
        heap = []
        for url, score in url_score_dict.iteritems():
            heap.append(id_score(int(url), score))
        urlids = []
        heapq.heapify(heap)
        for i in range(k):
            if heap:
                urlids.append(heapq.heappop(heap).urlid)
        return urlids

    def search_rsv(self, query, k=50):
        """Okapi BM25 (RSV) ranking; returns up to k urlids, best first."""
        k1 = 1.5  # document tf saturation
        k3 = 1.5  # query tf saturation
        b = 0.75  # length-normalization strength
        querydict_tf = {}
        heap = []
        urlids = []
        self.querylist = self.parser.normalize(query)
        for item in self.querylist:
            if item in querydict_tf:
                querydict_tf[item] += 1
            else:
                querydict_tf[item] = 1
        for i in range(self.__urlnum):
            score = 0
            for item in querydict_tf.iterkeys():
                if (item in self.__invertedindex
                        and str(i) in self.__invertedindex[item][1]):
                    score += math.log10(1.0 * self.__urlnum / self.__invertedindex[item][0]) * (k1 + 1) * \
                        self.__invertedindex[item][1][str(i)][0] / (
                            k1 * ((1 - b) + b * (1.0 * self.urllist[i][3] / self.lave)) +
                            self.__invertedindex[item][1][str(i)][0]) * (k3 + 1) * querydict_tf[item] / (
                            k3 + querydict_tf[item])
            uid = id_score(i, score)
            if uid.score > 0:
                heap.append(uid)
        heapq.heapify(heap)
        for i in range(k):
            if heap:
                urlids.append(heapq.heappop(heap).urlid)
        return urlids

    def lm(self, query, k=50):
        """Query-likelihood language model with Jelinek-Mercer smoothing."""
        querydict_tf = {}
        weight = {}
        heap = []
        urlids = []
        lam = 0.8  # weight of the document model vs the collection model
        self.querylist = self.parser.normalize(query)
        totaldoc = len(self.urllist)
        for item in self.querylist:
            if item in querydict_tf:
                querydict_tf[item] += 1
            else:
                querydict_tf[item] = 1
        for item in querydict_tf.iterkeys():
            if item in self.__invertedindex:
                weight[item] = (
                    1.0 + math.log10(querydict_tf[item])) * math.log10(
                        1.0 * totaldoc / self.__invertedindex[item][0])
            else:
                weight[item] = 0
        for i in range(self.__urlnum):
            score = 1
            for item in weight.iterkeys():
                if (item in self.__invertedindex
                        and str(i) in self.__invertedindex[item][1]):
                    # Mix P(t|d) with the collection probability P(t|C),
                    # raised to the query weight.
                    a = float(self.__invertedindex[item][1][str(i)]
                              [0]) / self.urllist[i][3]
                    b = float(self.__invertedindex[item][2]) / self.totallength
                    score *= (lam * a + (1 - lam) * b) ** weight[item]
                else:
                    score = 0
            uid = id_score(i, score)
            if uid.score > 0:
                heap.append(uid)
        heapq.heapify(heap)
        for i in range(k):
            if heap:
                urlids.append(heapq.heappop(heap).urlid)
        return urlids

    def gettitle(self, url):
        # Stub: live fetching is disabled; callers get a placeholder title.
        return u'in_gettitle'

    def word_correct(self, word):
        """Placeholder for spelling correction; currently returns []."""
        word = self.parser.normalize(word)
        word_list = []
        term = self.__invertedindex.keys()
        # TODO: implement correction using the vocabulary in `term`.
        return word_list

    def wildcard2word(self, wildcard):
        """Expand a single-'*' wildcard into matching dictionary words.

        a*b is rotated to the prefix query b$a against the roll index and
        every match is rotated back into a normal word.
        """

        def derolled(word):
            assert '$' in word
            first, second = word.split('$')
            return second + first

        assert '*' in wildcard
        first, second = wildcard.split('*')
        rolled_word = second + '$' + first
        return map(derolled, self.roll_index.find_by_prefix(rolled_word))