attach[position[i+1]]+=1 i+=1 else: notend=False else: cpl.append(phrase_list) return cpl if __name__ == '__main__': cc=correction() word=raw_input('输入纠正词: ').decode('utf-8') choose=input('输入选择:1.普通查找 2.精确查找\n') if choose==1: pp=Parser() phrase_list=pp.normalize(word) else: if choose==2: i=0 phrase_list=[] while i < len(word): phrase_list.append(word[i]) i+=1 '''for ii in phrase_list: print(ii.encode('utf-8'))''' print('________________before correct________________') ll=cc.correct(phrase_list) if len(ll)==0: print('no correct') else: for item in ll:
class IndexBuilder(object):
    """Builds and persists an inverted index.

    self.index maps each term to a posting list of the form
        [df, {urlid: [tf, normalized tf-idf]}, collection_tf]
    The index is loaded from, and saved back to, a JSON file whose first
    line holds the document count.
    """

    def __init__(self, invertedindex='invertedindex'):
        # invertedindex: path of the JSON file holding the persisted index.
        self.__p = Parser()
        self.index = {}
        self.__urlnum = 0
        self.__fileName = invertedindex
        try:
            with open(invertedindex, 'r') as fin:
                sys.stderr.write('Reading invertedindex in...')
                # First line is the number of documents; the rest is JSON.
                self.__urlnum = int(fin.readline())
                self.index = json.load(fin)
                sys.stderr.write('[Success]' + '\n')
        except IOError as err:
            # A missing/unreadable file is expected on first run: start empty.
            sys.stderr.write(str(type(err)) + str(err.args) + '\n')
            sys.stderr.write('Will rebuild the Inverted Index' + '\n')

    def __count_words(self, text, urlid):
        # Tokenize text and bump the tf counter of every token for urlid;
        # returns the number of tokens seen.
        length = 0
        for word in self.__p.normalize(text):
            records = self.index.setdefault(word, [0, {}, 0])[1]
            record = records.setdefault(urlid, [0, 0])
            record[0] = record[0] + 1
            length = length + 1
        return length

    def process(self, soup, urlid):
        """Add one parsed page (a BeautifulSoup object) to the index.

        Returns the page's token length. Title tokens are up-weighted by
        repeating the title five times.
        """
        urlid = str(urlid)  # JSON object keys are strings; keep ids consistent
        self.__urlnum = self.__urlnum + 1
        text = soup.get_text()
        length = 0
        if text:
            length += self.__count_words(text, urlid)
        try:
            title = soup.title.string
        except AttributeError:
            # Page has no <title> (soup.title is None). Was a bare except,
            # which also swallowed KeyboardInterrupt/SystemExit.
            title = None
        if title is not None:
            # Drop the trailing site suffix (16 chars) and weight title x5.
            title = unicode(title)[:-16] * 5
        if title:
            length += self.__count_words(title, urlid)
        return length

    def save(self):
        """Recompute tf-idf weights and write the index back to disk."""
        self.__calculateTf_idf()
        try:
            with open(self.__fileName, 'w') as fout:
                sys.stderr.write('Save back to "invertedindex"...')
                fout.write(str(self.__urlnum) + '\n')
                json.dump(self.index, fout, indent=1)
                sys.stderr.write('[Success]' + '\n')
        except IOError as err:
            sys.stderr.write(str(err) + '\n')
            sys.stderr.write('Can not write back to "invertedindex"!!!'
                             + '\n')
        return

    def __calculateTf_idf(self):
        """Fill in df, per-document cosine-normalized tf-idf, collection tf."""
        sys.stderr.write('Calculating...')
        # tf-idf = (1 + log(tf, base)) * log(N / df, base)
        base = 10
        for postingList in self.index.itervalues():
            postingList[0] = len(postingList[1])
            df = postingList[0]
            postingList[2] = 0
            for record in postingList[1].itervalues():
                tf = record[0]
                record[1] = (1 + math.log(tf, base)) * math.log(
                    self.__urlnum / float(df), base)
                postingList[2] = postingList[2] + tf
        # Cosine-normalize every document's tf-idf vector.
        for i in xrange(self.__urlnum):
            i = str(i)  # posting-list keys are string urlids
            length = 0
            for postingList in self.index.itervalues():
                records = postingList[1]
                tf_idf = records.get(i, [0, 0])[1]
                length = length + tf_idf * tf_idf
            length = math.sqrt(length)
            for postingList in self.index.itervalues():
                records = postingList[1]
                if i in records:
                    record = records[i]
                    record[1] = record[1] / length
        sys.stderr.write('[Success]' + '\n')
        return

    def __str__(self):
        import locale
        res = ''
        for key, val in self.index.items():
            res = res + key.encode(
                locale.getpreferredencoding()) + ' ' + str(
                    val[0]) + ' ' + '{'
            for i in xrange(self.__urlnum):
                # Posting-list keys are string urlids, so look up str(i).
                # (Previously the int i was used, which always returned the
                # default and printed empty postings.)
                res = res + str(i) + ':' + str(val[1].get(str(i), [0, 0.0]))
            res = res + '}' + '\n'
        return res

    def __repr__(self):
        return str(self)
class searcher:
    """Ranks urls against a query using the inverted index.

    Posting lists come from IndexBuilder().index:
        term -> [df, {urlid: [tf, tf-idf]}, collection_tf]
    """

    def __init__(self):
        self.__invertedindex = IndexBuilder().index
        self.pp = Parser()
        self.pp.normalize("a")  # warm up the parser/segmenter
        self.pagerank = []
        with open("urllist", "r") as f1:  # open the urllist file
            self.__num1 = int(f1.readline())  # total number of urls
            self.urllist = []
            n = 0
            while n < self.__num1:  # read per-url metadata
                s = f1.readline()
                arr = s.split(" ")
                # arr[0] is the url id (implicit by row position)
                url = arr[1]                     # url address
                indegree = int(arr[2])           # in-degree, used by PageRank
                outdegree = int(arr[3])          # out-degree
                length_of_texts = int(arr[4])
                self.urllist.append([url, indegree, outdegree, length_of_texts])
                n = n + 1
        with open("pagerank", "r") as file:
            for line in file:
                self.pagerank.append(float(line))

    def search_cos(self, query, pagerank=True):
        """Cosine-similarity ranking; returns up to 50 urlids, best first.

        When pagerank is True, each cosine score is multiplied by the
        page's PageRank value.
        """
        querydict_tf = {}
        weight = {}
        heap = []
        urlids = []
        self.querylist = self.pp.normalize(query)
        totaldoc = len(self.urllist)
        for item in self.querylist:
            if item in querydict_tf:
                querydict_tf[item] += 1
            else:
                querydict_tf[item] = 1
        for item in querydict_tf.iterkeys():
            if item in self.__invertedindex:
                # ltc query weighting: (1 + log tf) * idf
                weight[item] = (1.0 + math.log10(querydict_tf[item])) * math.log10(
                    1.0 * totaldoc / self.__invertedindex[item][0])
            else:
                weight[item] = 0
        i = 0
        while i < self.__num1:
            score = 0
            for item in weight.iterkeys():
                if item in self.__invertedindex and str(i) in self.__invertedindex[item][1]:
                    score += weight[item] * self.__invertedindex[item][1][str(i)][1]
            if pagerank:
                score *= self.pagerank[i]
            uid = id_score(i, score)
            if uid.score > 0:
                # Keep only the best 50 candidates on the heap.
                # (Was `<= 50`, which retained 51 results.)
                if len(heap) < 50:
                    heapq.heappush(heap, uid)
                else:
                    heapq.heappushpop(heap, uid)
            i += 1
        # Drain the heap (smallest first) and reverse for best-first order.
        while len(heap) > 0:
            urlids.append(heapq.heappop(heap).urlid)
        urlids.reverse()
        return urlids

    def boolean(self, query):
        """AND-semantics boolean search; scores hits by summed tf."""
        query = self.pp.normalize(query)  # parse the query
        character_set = list(set(query))  # de-duplicate terms
        if not character_set:
            # Empty/stopword-only query: previously character_set[0]
            # raised IndexError here.
            return []
        # Start from the first term's postings and intersect the rest.
        finalindex = self.__invertedindex.get(character_set[0], [0, {}, 0])[1].keys()
        for term in character_set:
            if finalindex:
                index = self.__invertedindex.get(term, [0, {}, 0])[1].keys()
                finalindex = list(set(finalindex) & set(index))
            else:
                return finalindex
        heap = []
        for url in finalindex:
            score = 0
            for term in character_set:
                score = score + self.__invertedindex.get(term, [0, {}, 0])[1][url][0]
            heap.append(id_score(int(url), score))
        heapq.heapify(heap)
        urlids = []
        while len(heap) > 0:
            urlids.append(heapq.heappop(heap).urlid)
        urlids.reverse()
        return urlids


def gettitle(url):
    """Fetch url and return its <title> string, or None on any failure."""
    try:
        req_header = {
            "User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6"
        }
        req = urllib2.Request(url, None, req_header)
        page = urllib2.urlopen(req, None, 54)
        html = page.read()
        page.close()
        soup = BeautifulSoup(html)
        title = soup.title
        title = title.string
    except Exception as e:
        print(e)
        title = None
    return title
attach[position[i + 1]] += 1 i += 1 else: notend = False else: cpl.append(phrase_list) return cpl if __name__ == '__main__': cc = correction() word = raw_input('输入纠正词: ').decode('utf-8') choose = input('输入选择:1.普通查找 2.精确查找\n') if choose == 1: pp = Parser() phrase_list = pp.normalize(word) else: if choose == 2: i = 0 phrase_list = [] while i < len(word): phrase_list.append(word[i]) i += 1 '''for ii in phrase_list: print(ii.encode('utf-8'))''' print('________________before correct________________') ll = cc.correct(phrase_list) if len(ll) == 0: print('no correct') else: for item in ll:
class correction:
    """Suggests spelling corrections for Chinese queries via pinyin lookup."""

    def __init__(self):
        self.pa = Parser()
        self.pp = PinYin()
        self.pp.load_word()
        # pinyin_dict: one JSON line mapping a pinyin string to a list of
        # (phrase, frequency) candidates.
        with open(os.path.join(os.path.dirname(__file__), 'pinyin_dict'), 'r') as ff:
            self.jj_dict = json.loads(ff.readline())
        # (removed a redundant ff.close(): the with-block already closes it)

    def correct(self, word, choose):
        """Return up to 5 correction strings for word, best first.

        choose == 1: segment word with the Parser;
        choose == 2: treat every character as its own phrase.
        The original query string itself is never suggested.
        """
        if choose == 1:
            phrase_list = self.pa.normalize(word)
        elif choose == 2:
            phrase_list = []
            i = 0
            while i < len(word):
                phrase_list.append(word[i])
                i += 1
        termlist = []
        newplist = self.recompose(phrase_list)
        for nnlist in newplist:
            i = 0
            tmp_correct = []   # per-position candidate (phrase, freq) lists
            correct_num = []   # per-position index of the current candidate
            for item in nnlist:
                py = self.pp.hanzi2pinyin_split(item, '_')
                tmp_correct.append([])
                if py in self.jj_dict:
                    for item2 in self.jj_dict[py]:
                        tmp_correct[i].append(item2)
                else:
                    # Unknown pinyin: keep the original phrase with freq 1.
                    tmp_correct[i].append((item, 1))
                correct_num.append(0)
                i += 1
            length = len(tmp_correct)
            # Enumerate the cartesian product of candidates, odometer-style.
            notend = True
            while notend:
                i = 0
                tmpstr = ''
                score = 0
                for j in xrange(0, length):
                    tmps = tmp_correct[j][correct_num[j]][0]
                    tmpstr += tmps
                    # Longer matched phrases dominate via the **7 weight.
                    score += int(tmp_correct[j][correct_num[j]][1]) * len(tmps) ** 7
                termlist.append(tup(tmpstr, score))
                correct_num[0] += 1
                while correct_num[i] >= len(tmp_correct[i]):
                    correct_num[i] = 0
                    if i < length - 1:
                        correct_num[i + 1] += 1
                        i += 1
                    else:
                        notend = False
        result_list = self.sscore(termlist)
        comstr = ''
        for item in phrase_list:
            comstr += item
        # Never suggest the query itself.
        if result_list[0] == comstr:
            result_list = []
        elif comstr in result_list:
            result_list.pop(result_list.index(comstr))
        return result_list

    def sscore(self, termlist):
        """Return the terms of the 5 highest-scoring tup items, best first."""
        heap = list(termlist)
        heapq.heapify(heap)
        # Discard the low scores until only the top 5 remain.
        while len(heap) > 5:
            heapq.heappop(heap)
        result_list = []
        while len(heap) > 0:
            result_list.append(heapq.heappop(heap).term)
        result_list.reverse()
        return result_list

    def recompose(self, phrase_list):
        """Enumerate ways of merging single characters into neighbours.

        Every 1-character phrase can merge left (state 0), merge right
        (state 1) or stay alone (state 2); the attach odometer walks all
        state combinations and the distinct resulting phrase lists are
        returned (list of lists).
        """
        position = []   # indices of single-character phrases
        attach = {}     # index -> merge state (0 left, 1 right, 2 keep)
        cpl = []        # the result: list of phrase lists
        i = 0
        for item in phrase_list:
            if len(item) == 1:
                position.append(i)
                attach[i] = 0
            i += 1
        notend = True
        length = len(position)
        if length > 0:
            while notend:
                gap = 0  # how many phrases have been merged away so far
                tmp_list = copy.deepcopy(phrase_list)
                tmp_position = copy.deepcopy(position)
                pi = 0
                while pi < len(tmp_position):
                    item2 = tmp_position[pi]
                    if attach[item2] == 0:
                        # Merge this character into the phrase on its left.
                        if item2 - 1 - gap >= 0:
                            tmp_list[item2 - 1 - gap] += tmp_list[item2 - gap]
                            k = tmp_position.index(item2)
                            tmp_position.pop(k)
                            tmp_list.pop(item2 - gap)
                            gap += 1
                        else:
                            pi += 1
                    elif attach[item2] == 1:
                        # Merge this character into the phrase on its right.
                        if item2 + 1 - gap < len(tmp_list):
                            tmp_list[item2 + 1 - gap] = tmp_list[item2 - gap] + tmp_list[item2 + 1 - gap]
                            k = tmp_position.index(item2)
                            tmp_position.pop(k)
                            tmp_list.pop(item2 - gap)
                            gap += 1
                            # The right neighbour (if itself single) was
                            # consumed too; drop it from the worklist.
                            if item2 + 1 in tmp_position:
                                tmp_position.pop(tmp_position.index(item2 + 1))
                            else:
                                pi += 1
                        else:
                            pi += 1
                    else:
                        pi += 1
                if tmp_list not in cpl:
                    cpl.append(tmp_list)
                # Advance the attach odometer: one state change per round.
                attach[position[0]] += 1
                i = 0
                while attach[position[i]] >= 3:
                    attach[position[i]] = 0
                    if i < length - 1:
                        attach[position[i + 1]] += 1
                        i += 1
                    else:
                        notend = False
        else:
            # No single characters: the input is the only variant.
            cpl.append(phrase_list)
        return cpl
class searcher:
    """Search front-end over the inverted index.

    Provides cosine, boolean (AND/OR/NOT), BM25 (RSV) and language-model
    ranking, snippet extraction, and wildcard expansion via a rolled
    (rotated) term index.
    """

    def __init__(self):
        self.__invertedindex = IndexBuilder().index
        self.parser = Parser()
        self.parser.normalize('a')  # warm up the segmenter
        self.dictionary = None
        self.totallength = 0
        self.lave = 0
        self.roll_index = VocabTree()
        with open(os.path.join(os.path.dirname(__file__), 'urllist'), 'r') as f1:  # open urllist
            self.__urlnum = int(f1.readline())  # total number of urls
            self.urllist = []
            n = 0
            while n < self.__urlnum:  # read per-url metadata
                s = f1.readline()
                arr = s.split(' ')
                # arr[0] is the url id (implicit by row position)
                url = arr[1]                     # url address
                indegree = int(arr[2])           # in-degree (PageRank)
                outdegree = int(arr[3])          # out-degree
                length_of_texts = int(arr[4])
                self.urllist.append(
                    [url, indegree, outdegree, length_of_texts])
                n = n + 1
                self.totallength += length_of_texts
            # Average document length. Float division: the original integer
            # division truncated lave and skewed BM25 length normalization.
            self.lave = float(self.totallength) / self.__urlnum
        with open(os.path.join(os.path.dirname(__file__), 'htmls'), 'r') as file:
            self.htmls = json.load(file)  # [[title, text], ...] per urlid
        with open(os.path.join(os.path.dirname(__file__), 'dictionary'), 'r') as file:
            self.dictionary = json.load(file)
        # Rolled index: every rotation word[i:] + '$' + word[:i] is added,
        # so a wildcard a*b becomes the prefix query b$a.
        sys.stderr.write('Building roll index...')
        for word in self.dictionary:
            for i in range(len(word) + 1):
                self.roll_index.add_word(word[i:] + '$' + word[:i])
        sys.stderr.write('[Success]\n')

    def search_cos(self, query, k=50):
        """tf-idf cosine ranking; returns up to k urlids, best first."""
        querydict_tf = {}
        weight = {}
        heap = []
        urlids = []
        self.querylist = self.parser.normalize(query)
        totaldoc = len(self.urllist)
        for item in self.querylist:
            if item in querydict_tf:
                querydict_tf[item] += 1
            else:
                querydict_tf[item] = 1
        for item in querydict_tf.iterkeys():
            if item in self.__invertedindex:
                # ltc query weighting: (1 + log tf) * idf
                weight[item] = (
                    1.0 + math.log10(querydict_tf[item])) * math.log10(
                        1.0 * totaldoc / self.__invertedindex[item][0])
            else:
                weight[item] = 0
        for i in range(self.__urlnum):
            score = 0
            for item in weight.iterkeys():
                if (item in self.__invertedindex
                        and str(i) in self.__invertedindex[item][1]):
                    score += weight[item] * self.__invertedindex[item][1][str(i)][1]
            uid = id_score(i, score)
            if uid.score > 0:
                heap.append(uid)
        heapq.heapify(heap)
        for i in range(k):
            if heap:
                urlids.append(heapq.heappop(heap).urlid)
        return urlids

    def abstract(self, query, urlid):
        """Build highlighted snippets around each query term found in the
        text of urlid.

        Each snippet is a list of alternating context/highlight fragments;
        the left-context size is randomized slightly for variety.
        """
        query_list = self.parser.normalize(query)
        result_list = []
        for item in query_list:
            index = self.htmls[urlid][1].lower().find(item)
            if index != -1:
                result_list.append([])
                start = -int(random.random() * 10)  # random left context
                length = 15 - start * 2
                ll = len(item)
                if index >= -start:
                    i = start
                    a = 0
                    result_list[len(result_list) - 1].append('')
                    while i < start + length and index + i < len(
                            self.htmls[urlid][1]):
                        if i == 0:
                            a += 1
                            result_list[len(result_list) - 1].append('')
                        if i == ll:
                            a += 1
                            result_list[len(result_list) - 1].append('')
                        result_list[len(result_list) - 1][a] += self.htmls[urlid][1][index + i]
                        i += 1
                    if i <= ll:
                        result_list[len(result_list) - 1].append('')
                else:
                    # Term too close to the start of the text for left context.
                    # NOTE(review): this branch resets a = 0 where the branch
                    # above increments it; looks suspicious — confirm the
                    # intended fragment layout before changing it.
                    i = 0
                    a = 0
                    result_list[len(result_list) - 1].append('')
                    while i < length and index + i < len(self.htmls[urlid][1]):
                        if i == 0:
                            a = 0
                            result_list[len(result_list) - 1].append('')
                        if i == ll:
                            a = 0
                            result_list[len(result_list) - 1].append('')
                        result_list[len(result_list) - 1][a] += self.htmls[urlid][1][index + i]
                        i += 1
                    if i <= ll:
                        result_list[len(result_list) - 1].append('')
        return result_list

    def boolean(self, query, k=50):
        """Boolean retrieval supporting AND / OR / NOT and parentheses.

        The query is parsed with an NLTK CFG into a tree, then evaluated
        bottom-up into a {urlid: score} dict; returns up to k urlids.
        """

        def query_to_tree(query):
            # Tokenize, replace each term with the #TERM# tag, parse the tag
            # sequence, then substitute the real terms back into the tree.
            text2token = r'AND|OR|NOT|\w+|\(|\)'
            token2tag = {
                'AND': 'AND',
                'OR': 'OR',
                'NOT': 'NOT',
                '(': 'LP',
                ')': 'RP'
            }
            grammar = """
            exp -> orexp
            orexp -> orexp "OR" andexp
            orexp -> andexp
            andexp -> andexp "AND" notexp
            andexp -> andexp notexp
            andexp -> notexp
            notexp -> "NOT" metaexp
            notexp -> metaexp
            metaexp -> "LP" exp "RP"
            metaexp -> "#TERM#"
            """
            token = nltk.regexp_tokenize(query, text2token)
            tags = [token2tag.get(t, '#TERM#') for t in token]
            terms = [
                t for t in token if t not in ['AND', 'OR', 'NOT', '(', ')']
            ]
            parser = nltk.ChartParser(nltk.CFG.fromstring(grammar))
            for tree in parser.parse(tags):
                treestr = str(tree)
                for t in terms:
                    treestr = treestr.replace("#TERM#", t, 1)
                tree = nltk.Tree.fromstring(treestr)
            return tree

        def traversal(tree):
            # Evaluate the parse tree into a {urlid: score} dict.

            def dict_or(d1, d2):
                # Union of postings; scores add.
                rval = {}
                for key in set(d1.keys()).union(d2.keys()):
                    rval[key] = d1.get(key, 0) + d2.get(key, 0)
                return rval

            def dict_and(d1, d2):
                # Intersection of postings; score is the minimum.
                rval = {}
                for key in set(d1.keys()).intersection(d2.keys()):
                    rval[key] = min(d1.get(key), d2.get(key))
                return rval

            def dict_not(d):
                # Complement over all known urls, with score 0.
                return {
                    url: 0
                    for url in {str(u)
                                for u in range(len(self.urllist))} -
                    set(d.keys())
                }

            def word2dict(word):
                term = self.parser.stem(word)
                return {
                    urlid: tf_idf[0]
                    for urlid, tf_idf in self.__invertedindex.get(
                        term, [0, {}, 0])[1].iteritems()
                }

            if isinstance(tree, str) or isinstance(tree, unicode):
                return word2dict(tree)
            elif len(tree) == 1:
                return traversal(tree[0])
            elif tree.label() == 'orexp':
                assert tree[1] == 'OR'
                return dict_or(traversal(tree[0]), traversal(tree[2]))
            elif tree.label() == 'andexp':
                if tree[1] == 'AND':
                    return dict_and(traversal(tree[0]), traversal(tree[2]))
                else:
                    return dict_and(traversal(tree[0]), traversal(tree[1]))
            elif tree.label() == 'notexp':
                assert tree[0] == 'NOT'
                return dict_not(traversal(tree[1]))
            elif tree.label() == 'metaexp':
                assert tree[0] == 'LP'
                assert tree[2] == 'RP'
                return traversal(tree[1])

        if not self.parser.normalize(query):
            return []
        tree = query_to_tree(query)
        url_score_dict = traversal(tree)
        heap = []
        for url, score in url_score_dict.iteritems():
            heap.append(id_score(int(url), score))
        urlids = []
        heapq.heapify(heap)
        for i in range(k):
            if heap:
                urlids.append(heapq.heappop(heap).urlid)
        return urlids

    def search_rsv(self, query, k=50):
        """Okapi BM25 (RSV) ranking; returns up to k urlids, best first."""
        k1 = 1.5  # document tf saturation
        k3 = 1.5  # query tf saturation
        b = 0.75  # length-normalization strength
        querydict_tf = {}
        heap = []
        urlids = []
        self.querylist = self.parser.normalize(query)
        for item in self.querylist:
            if item in querydict_tf:
                querydict_tf[item] += 1
            else:
                querydict_tf[item] = 1
        for i in range(self.__urlnum):
            score = 0
            for item in querydict_tf.iterkeys():
                if (item in self.__invertedindex
                        and str(i) in self.__invertedindex[item][1]):
                    score += math.log10(1.0 * self.__urlnum / self.__invertedindex[item][0]) * (k1 + 1) * \
                        self.__invertedindex[item][1][str(i)][0] / (
                            k1 * ((1 - b) + b * (1.0 * self.urllist[i][3] / self.lave)) +
                            self.__invertedindex[item][1][str(i)][0]) * (k3 + 1) * querydict_tf[item] / (
                            k3 + querydict_tf[item])
            uid = id_score(i, score)
            if uid.score > 0:
                heap.append(uid)
        heapq.heapify(heap)
        for i in range(k):
            if heap:
                urlids.append(heapq.heappop(heap).urlid)
        return urlids

    def lm(self, query, k=50):
        """Query-likelihood language model with Jelinek-Mercer smoothing."""
        querydict_tf = {}
        weight = {}
        heap = []
        urlids = []
        lam = 0.8  # weight of the document model vs the collection model
        self.querylist = self.parser.normalize(query)
        totaldoc = len(self.urllist)
        for item in self.querylist:
            if item in querydict_tf:
                querydict_tf[item] += 1
            else:
                querydict_tf[item] = 1
        for item in querydict_tf.iterkeys():
            if item in self.__invertedindex:
                weight[item] = (
                    1.0 + math.log10(querydict_tf[item])) * math.log10(
                        1.0 * totaldoc / self.__invertedindex[item][0])
            else:
                weight[item] = 0
        for i in range(self.__urlnum):
            score = 1
            for item in weight.iterkeys():
                if (item in self.__invertedindex
                        and str(i) in self.__invertedindex[item][1]):
                    # Mix P(t|d) with the collection probability P(t|C),
                    # raised to the query weight.
                    a = float(self.__invertedindex[item][1][str(i)]
                              [0]) / self.urllist[i][3]
                    b = float(self.__invertedindex[item][2]) / self.totallength
                    score *= (lam * a + (1 - lam) * b) ** weight[item]
                else:
                    score = 0
            uid = id_score(i, score)
            if uid.score > 0:
                heap.append(uid)
        heapq.heapify(heap)
        for i in range(k):
            if heap:
                urlids.append(heapq.heappop(heap).urlid)
        return urlids

    def gettitle(self, url):
        # Stub: live fetching is disabled; callers get a placeholder title.
        return u'in_gettitle'

    def word_correct(self, word):
        """Placeholder for spelling correction; currently returns []."""
        word = self.parser.normalize(word)
        word_list = []
        term = self.__invertedindex.keys()
        # TODO: implement correction using the vocabulary in `term`.
        return word_list

    def wildcard2word(self, wildcard):
        """Expand a single-'*' wildcard into matching dictionary words.

        a*b is rotated to the prefix query b$a against the roll index and
        every match is rotated back into a normal word.
        """

        def derolled(word):
            assert '$' in word
            first, second = word.split('$')
            return second + first

        assert '*' in wildcard
        first, second = wildcard.split('*')
        rolled_word = second + '$' + first
        return map(derolled, self.roll_index.find_by_prefix(rolled_word))