import json
import os
import socket
import sys
import urllib2

# (The project-local import of IndexBuilder is not shown in this excerpt.)


class crawl:
    def __init__(self, urllist='urllist', invertedindex='invertedindex', htmls='htmls'):
        # Resolve output paths relative to this script and set up the index builder.
        self.urllistName = os.path.join(os.path.dirname(__file__), urllist)
        self.htmlsName = os.path.join(os.path.dirname(__file__), htmls)
        self.indexbuilder = IndexBuilder(invertedindex)
        self.urls = []
        self.htmls = []
        self.length = []

    def process(self):
        # index.txt on the local server lists one file name per whitespace-separated token.
        file_names = urllib2.urlopen("http://127.0.0.1:9000/index.txt").read().split()
        for file_name in file_names:
            url = "http://127.0.0.1:9000/{}".format(file_name)
            try:
                content = urllib2.urlopen(url).read()
                title = content[:content.find("\n")]  # first line is the title
                self.urls.append(url)
                self.length.append(self.indexbuilder.process(content, title, len(self.urls) - 1))
                self.htmls.append([title, content])
            except urllib2.URLError as e:
                print(type(e), e.args)
                print(url)
            except socket.timeout as e:
                print(type(e), e.args)
                print(url)
            except Exception as e:
                print(e)
                print(url)

    def save(self):
        self.indexbuilder.save()
        with open(self.htmlsName, 'w') as hh:
            json.dump(self.htmls, hh)
        with open(self.urllistName, 'w') as uu:
            # First line: URL count; then "id url indegree outdegree textlength" per line.
            uu.write('%d\n' % len(self.urls))
            print('Writing urllist back into file...')
            for i, item in enumerate(self.urls):
                try:
                    uu.write('%d %s %d %d %d\n' % (i, item, 0, 0, self.length[i]))
                except Exception:
                    sys.stderr.write('%d %s\n' % (i, item))
                    sys.stderr.write('urls output wrong\n')
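# A minimal usage sketch (an assumed entry point, not part of the original source):
# this variant of crawl pulls file names from a local HTTP server's index.txt,
# indexes each document through IndexBuilder, and persists everything with save().
if __name__ == '__main__':
    c = crawl()
    c.process()  # fetch and index every file listed in http://127.0.0.1:9000/index.txt
    c.save()     # write 'invertedindex', 'htmls' and 'urllist' next to this script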
import heapq
import math
import urllib2

from bs4 import BeautifulSoup

# (Project-local imports of IndexBuilder, Parser and id_score are not shown in
# this excerpt.)


class searcher:
    # self.__invertedindex = IndexBuilder().index
    # self.pp = Parser(); self.pp.normalize(str) -> ['word', 'word', ...]
    def __init__(self):
        self.__invertedindex = IndexBuilder().index
        self.pp = Parser()
        self.pp.normalize("a")
        self.pagerank = []
        with open("urllist", "r") as f1:  # open the urllist file
            self.__num1 = int(f1.readline())  # total number of URLs
            self.urllist = []
            n = 0
            while n < self.__num1:  # load one record per URL
                s = f1.readline()
                arr = s.split(" ")
                # urlid = int(arr[0])  # URL ID
                url = arr[1]  # URL address
                indegree = int(arr[2])  # in-degree, used for PageRank
                outdegree = int(arr[3])  # out-degree
                length_of_texts = int(arr[4])
                self.urllist.append([url, indegree, outdegree, length_of_texts])
                n = n + 1
        with open("pagerank", "r") as file:
            for line in file:
                self.pagerank.append(float(line))

    def search_cos(self, query, pagerank=True):
        # Cosine-style ranking: tf-idf weights on the query side, precomputed
        # normalized weights on the document side, optionally scaled by PageRank.
        querydict_tf = {}
        weight = {}
        heap = []
        urlids = []
        self.querylist = self.pp.normalize(query)
        totaldoc = len(self.urllist)
        for item in self.querylist:
            if item in querydict_tf:
                querydict_tf[item] += 1
            else:
                querydict_tf[item] = 1
        for item in querydict_tf.iterkeys():
            if item in self.__invertedindex:
                weight[item] = (1.0 + math.log10(querydict_tf[item])) * math.log10(
                    1.0 * totaldoc / self.__invertedindex[item][0])
            else:
                weight[item] = 0
        i = 0
        while i < self.__num1:
            score = 0
            for item in weight.iterkeys():
                if item in self.__invertedindex and str(i) in self.__invertedindex[item][1]:
                    score += weight[item] * self.__invertedindex[item][1][str(i)][1]
            if pagerank:
                score *= self.pagerank[i]
            uid = id_score(i, score)
            if uid.score > 0:
                # Keep only the best candidates in a bounded heap.
                if len(heap) <= 50:
                    heapq.heappush(heap, uid)
                else:
                    heapq.heappushpop(heap, uid)
            i += 1
        # Pop in ascending order, then reverse for best-first output.
        while len(heap) > 0:
            urlids.append(heapq.heappop(heap).urlid)
        urlids.reverse()
        return urlids

    # boolean search
    def boolean(self, query):
        query = self.pp.normalize(query)  # parse the query into terms
        character_set = list(set(query))  # deduplicate
        if not character_set:
            return []
        # (An earlier, commented-out variant sorted terms by posting-list length
        # before intersecting.)
        # Start from the posting list of the first term, then intersect the rest.
        finalindex = self.__invertedindex.get(character_set[0], [0, {}, 0])[1].keys()
        for term in character_set:
            if finalindex:
                index = self.__invertedindex.get(term, [0, {}, 0])[1].keys()
                finalindex = list(set(finalindex) & set(index))
            else:
                return finalindex
        heap = []
        for url in finalindex:
            score = 0
            for term in character_set:
                score = score + self.__invertedindex.get(term, [0, {}, 0])[1][url][0]
            heap.append(id_score(int(url), score))
        heapq.heapify(heap)
        urlids = []
        while len(heap) > 0:
            urlids.append(heapq.heappop(heap).urlid)
        urlids.reverse()
        return urlids


def gettitle(url):
    try:
        req_header = {
            "User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6"
        }
        req = urllib2.Request(url, None, req_header)
        page = urllib2.urlopen(req, None, 54)
        html = page.read()
        page.close()
        soup = BeautifulSoup(html)
        title = soup.title.string
    except Exception as e:
        print(e)
        title = None
    return title
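# Worked example of the query-side weight used by search_cos above (numbers are
# illustrative, not taken from a real index): with tf = 2 occurrences of a term
# in the query, N = 1000 documents and df = 10 documents containing the term,
# weight = (1 + log10(2)) * log10(1000 / 10) = 1.301 * 2 = 2.602.
import math

def query_weight(tf, totaldoc, df):
    # Same (1 + log tf) * log(N / df) formula search_cos applies per query term.
    return (1.0 + math.log10(tf)) * math.log10(1.0 * totaldoc / df)

assert abs(query_weight(2, 1000, 10) - 2.602) < 0.001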
import json
import os
import socket
import sys
import time
import urllib2

from bs4 import BeautifulSoup

# (The project-local import of IndexBuilder is not shown in this excerpt.)


class crawl:
    req_header = {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
    req_timeout = 54

    def __init__(self, baseurl='http://www.cc98.org', urllist='urllist',
                 queue='queue', invertedindex='invertedindex', graph='graph'):
        # Seed the crawler with the base URL; crawl state lives on the instance
        # (the original kept these as shared, mutable class attributes).
        self.baseurl = baseurl
        self.queueName = queue
        self.urllistName = urllist
        self.graphName = graph
        self.urlqueue = []   # BFS frontier of [url, parent] pairs
        self.urls = []
        self.indegree = []
        self.outdegree = []
        self.length = []
        self.totalcount = 0
        self.count = 0
        self.read_web = set()  # URLs already visited
        self.graph = []        # adjacency matrix; graph[i][j] == 1 means page j links to page i
        if (os.path.exists(self.urllistName) and os.path.exists(self.queueName)
                and os.path.exists(self.graphName)):
            # Resume the previous crawl if its files exist.
            self.indexbuilder = IndexBuilder(invertedindex)
            self.fillset(self.urllistName, self.queueName, self.graphName)
        else:
            self.indexbuilder = IndexBuilder()

    def user_agent(self, loopnum):
        # Breadth-first traversal of pages, at most loopnum fetches.
        if self.urlqueue:
            url_parent = self.urlqueue.pop(0)
            url, parent = url_parent[0], url_parent[1]
        else:
            url = self.baseurl
            parent = self.baseurl
        while self.count < loopnum:
            try:
                if url in self.read_web:
                    # Already fetched: just update in-degree and the link graph.
                    try:
                        urlid = self.urls.index(url)
                        self.indegree[urlid] += 1
                        self.graph[urlid][self.urls.index(parent)] = 1
                    except Exception as e:
                        print(type(e), e.args)
                else:
                    self.read_web.add(url)
                    tmpoutdegree = 0
                    print("it's the %d time" % self.count)
                    self.count = self.count + 1  # number of pages fetched
                    req = urllib2.Request(url, None, self.req_header)
                    page = urllib2.urlopen(req, None, self.req_timeout)
                    html = page.read()
                    page.close()
                    soup = BeautifulSoup(html)
                    self.urls.append(url)
                    # Grow the adjacency matrix by one column and one full row.
                    # (The original appended only two zeros to the new row, which
                    # broke indexing once more than two pages were known.)
                    for i in xrange(len(self.urls) - 1):
                        self.graph[i].append(0)
                    self.graph.append([0] * len(self.urls))
                    self.indegree.append(1)
                    self.graph[len(self.graph) - 1][self.urls.index(parent)] = 1
                    self.length.append(self.indexbuilder.process(soup, len(self.urls) - 1))
                    for i in soup.find_all(['a']):
                        suburl = i.get('href')
                        if suburl is not None and suburl.find('javascript') == -1:
                            if suburl.find('http') == -1:
                                suburl = self.baseurl + '/' + suburl
                            if suburl.find('www.cc98.org') != -1:  # stay on-site
                                self.urlqueue.append([suburl, url])
                            tmpoutdegree = tmpoutdegree + 1
                    self.outdegree.append(tmpoutdegree)
                    time.sleep(0.2)  # be polite to the server
            except urllib2.URLError as e:
                print(type(e), e.args)
                print(url)
            except socket.timeout as e:
                print(type(e), e.args)
                print(url)
            except Exception as e:
                print(e)
                print(url)
            if len(self.urlqueue) > 0:
                url_parent = self.urlqueue.pop(0)
                url, parent = url_parent[0], url_parent[1]
        # done

    def save(self):
        self.indexbuilder.save()
        with open(self.queueName, 'w') as qq:
            sys.stderr.write('Writing queue back into file...\n')
            for item in self.urlqueue:
                try:
                    if item is not None:
                        qq.write(item[0] + ' ' + item[1] + '\n')
                except Exception:
                    sys.stderr.write('queue wrong but things well\n')
        with open(self.urllistName, 'w') as uu:
            # First line: URL count; then "id url indegree outdegree textlength".
            uu.write('%d\n' % len(self.urls))
            print('Writing urllist back into file...')
            for i, item in enumerate(self.urls):
                try:
                    uu.write('%d %s %d %d %d\n' % (i, item, self.indegree[i],
                                                   self.outdegree[i], self.length[i]))
                except Exception:
                    sys.stderr.write('%d %s\n' % (i, item))
                    sys.stderr.write('urls output wrong\n')
        with open(self.graphName, 'w') as gg:
            print('Writing graph back into file...')
            try:
                json.dump(self.graph, gg, indent=1)
            except Exception as e:
                sys.stderr.write(repr(e) + '\n')
                sys.stderr.write('Graph store error\n')
        with open('graph.txt', 'w') as file:
            print('Writing graph.txt back into file...')
            try:
                for i in xrange(len(self.graph)):
                    for j in xrange(len(self.graph)):
                        if self.graph[i][j] == 1:
                            file.write(str(i) + ' ' + str(j) + ' ')
                    file.write('\n')
            except Exception as e:
                sys.stderr.write(repr(e) + '\n')
                sys.stderr.write('Graph.txt write error\n')

    def fillset(self, urllist, queue, graph):
        # Re-add previously visited URLs to the set and restore the queue and graph.
        sys.stderr.write('Reload queue, urllist, graph...')
        with open(urllist, 'r') as FILE:
            totalcount = FILE.readline()  # first line holds the URL count
            for item in FILE.readlines():
                try:
                    (tmpid, tmpurl, tmpind, tmpoud, tmplen) = item.strip('\n').split(' ')
                    self.urls.append(tmpurl)
                    self.read_web.add(tmpurl)
                    self.indegree.append(int(tmpind))
                    self.outdegree.append(int(tmpoud))
                    self.length.append(int(tmplen))
                except Exception as e:
                    sys.stderr.write(repr(e))
                    sys.stderr.write('read in data error\n')
        with open(queue, 'r') as FILE:
            for item in FILE.readlines():
                try:
                    self.urlqueue.append(item.strip('\n').split(' '))
                except Exception as e:
                    sys.stderr.write(repr(e))
                    sys.stderr.write('read queue in error but well\n')
        with open(graph, 'r') as FILE:
            try:
                self.graph = json.load(FILE)
            except Exception as e:
                sys.stderr.write(repr(e))
                sys.stderr.write('Read graph error\n')
        sys.stderr.write('[Success]\n')
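# The searcher defined earlier in this section reads per-URL scores from a
# 'pagerank' file, but nothing in this excerpt produces that file. A minimal
# power-iteration sketch, assuming the 'graph' JSON written by crawl.save()
# above (graph[i][j] == 1 meaning page j links to page i); the function name
# and parameters are illustrative, not part of the original source:
import json

def pagerank_from_graph(path='graph', d=0.85, iters=50):
    with open(path) as f:
        g = json.load(f)
    n = len(g)
    # Column sums are out-degrees; clamp at 1 to avoid dividing by zero.
    outdeg = [max(1, sum(g[i][j] for i in range(n))) for j in range(n)]
    pr = [1.0 / n] * n
    for _ in range(iters):
        pr = [(1.0 - d) / n +
              d * sum(pr[j] / outdeg[j] for j in range(n) if g[i][j])
              for i in range(n)]
    return pr

# One score per line, the format the pagerank-reading searcher expects:
# with open('pagerank', 'w') as f:
#     for v in pagerank_from_graph():
#         f.write('%f\n' % v)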
import heapq
import json
import math
import os
import random
import sys

import nltk

# (Project-local imports of IndexBuilder, Parser, VocabTree and id_score are
# not shown in this excerpt.)


class searcher:
    # constructor
    def __init__(self):
        self.__invertedindex = IndexBuilder().index
        self.parser = Parser()
        self.parser.normalize('a')
        self.dictionary = None
        self.totallength = 0
        self.lave = 0
        self.roll_index = VocabTree()
        with open(os.path.join(os.path.dirname(__file__), 'urllist'), 'r') as f1:
            self.__urlnum = int(f1.readline())  # total number of URLs
            self.urllist = []
            n = 0
            while n < self.__urlnum:  # load one record per URL
                s = f1.readline()
                arr = s.split(' ')
                # urlid = int(arr[0])  # URL ID
                url = arr[1]  # URL address
                indegree = int(arr[2])  # in-degree, used for PageRank
                outdegree = int(arr[3])  # out-degree
                length_of_texts = int(arr[4])
                self.urllist.append([url, indegree, outdegree, length_of_texts])
                n = n + 1
                self.totallength += length_of_texts
        # Average document length, needed by the BM25-style search_rsv below
        # (float() avoids Python 2 integer division).
        self.lave = float(self.totallength) / self.__urlnum
        with open(os.path.join(os.path.dirname(__file__), 'htmls'), 'r') as file:
            self.htmls = json.load(file)  # [[title, text], [title, text], ...]
        with open(os.path.join(os.path.dirname(__file__), 'dictionary'), 'r') as file:
            self.dictionary = json.load(file)
        # Rotation (permuterm) index for wildcard queries: every rotation of
        # word + '$' is inserted so a wildcard becomes a prefix lookup.
        sys.stderr.write('Building roll index...')
        for word in self.dictionary:
            for i in range(len(word) + 1):
                self.roll_index.add_word(word[i:] + '$' + word[:i])
        sys.stderr.write('[Success]\n')

    def search_cos(self, query, k=50):
        # Cosine-style tf-idf ranking; returns the ids of the top k documents.
        querydict_tf = {}
        weight = {}
        heap = []
        urlids = []
        self.querylist = self.parser.normalize(query)
        totaldoc = len(self.urllist)
        for item in self.querylist:
            if item in querydict_tf:
                querydict_tf[item] += 1
            else:
                querydict_tf[item] = 1
        for item in querydict_tf.iterkeys():
            if item in self.__invertedindex:
                weight[item] = (1.0 + math.log10(querydict_tf[item])) * math.log10(
                    1.0 * totaldoc / self.__invertedindex[item][0])
            else:
                weight[item] = 0
        for i in range(self.__urlnum):
            score = 0
            for item in weight.iterkeys():
                if item in self.__invertedindex and str(i) in self.__invertedindex[item][1]:
                    score += weight[item] * self.__invertedindex[item][1][str(i)][1]
            uid = id_score(i, score)
            if uid.score > 0:
                heap.append(uid)
        heapq.heapify(heap)
        for i in range(k):
            if heap:
                urlids.append(heapq.heappop(heap).urlid)
        return urlids

    def abstract(self, query, urlid):
        # Build snippet fragments around each query term occurrence in the page
        # text; each fragment is split into [before, term, after] pieces.
        query_list = self.parser.normalize(query)
        result_list = []
        for item in query_list:
            index = self.htmls[urlid][1].lower().find(item)
            if index != -1:
                result_list.append([])
                start = -int(random.random() * 10)  # random amount of left context
                length = 15 - start * 2
                ll = len(item)
                if index >= -start:
                    i = start
                    a = 0
                    result_list[-1].append('')
                    while i < start + length and index + i < len(self.htmls[urlid][1]):
                        if i == 0:  # the term itself starts here
                            a += 1
                            result_list[-1].append('')
                        if i == ll:  # the term ends here
                            a += 1
                            result_list[-1].append('')
                        result_list[-1][a] += self.htmls[urlid][1][index + i]
                        i += 1
                    if i <= ll:
                        result_list[-1].append('')
                else:
                    # Not enough left context: start at the term itself.
                    i = 0
                    a = 0
                    result_list[-1].append('')
                    while i < length and index + i < len(self.htmls[urlid][1]):
                        if i == 0:
                            a += 1
                            result_list[-1].append('')
                        if i == ll:
                            a += 1
                            result_list[-1].append('')
                        result_list[-1][a] += self.htmls[urlid][1][index + i]
                        i += 1
                    if i <= ll:
                        result_list[-1].append('')
        return result_list

    # boolean search
    def boolean(self, query, k=50):
        def query_to_tree(query):
            # Tokenize into operators/terms, then parse with a small CFG;
            # terms are tagged "#TERM#" and substituted back after parsing.
            text2token = r'AND|OR|NOT|\w+|\(|\)'
            token2tag = {'AND': 'AND', 'OR': 'OR', 'NOT': 'NOT', '(': 'LP', ')': 'RP'}
            grammar = """
            exp -> orexp
            orexp -> orexp "OR" andexp
            orexp -> andexp
            andexp -> andexp "AND" notexp
            andexp -> andexp notexp
            andexp -> notexp
            notexp -> "NOT" metaexp
            notexp -> metaexp
            metaexp -> "LP" exp "RP"
            metaexp -> "#TERM#"
            """
            token = nltk.regexp_tokenize(query, text2token)
            tags = [token2tag.get(t, '#TERM#') for t in token]
            terms = [t for t in token if t not in ['AND', 'OR', 'NOT', '(', ')']]
            parser = nltk.ChartParser(nltk.CFG.fromstring(grammar))
            for tree in parser.parse(tags):
                treestr = str(tree)
                for t in terms:
                    treestr = treestr.replace("#TERM#", t, 1)
                tree = nltk.Tree.fromstring(treestr)
                return tree

        def traversal(tree):
            # Evaluate the parse tree bottom-up over {urlid: score} dicts.
            def dict_or(id_score_dict1, id_score_dict2):
                rval = {}
                for key in set(id_score_dict1.keys()).union(id_score_dict2.keys()):
                    rval[key] = id_score_dict1.get(key, 0) + id_score_dict2.get(key, 0)
                return rval

            def dict_and(id_score_dict1, id_score_dict2):
                rval = {}
                for key in set(id_score_dict1.keys()).intersection(id_score_dict2.keys()):
                    rval[key] = min(id_score_dict1.get(key), id_score_dict2.get(key))
                return rval

            def dict_not(id_score_dict):
                return {
                    url: 0
                    for url in {str(url) for url in range(len(self.urllist))}
                    - set(id_score_dict.keys())
                }

            def word2dict(word):
                term = self.parser.stem(word)
                return {
                    urlid: tf_idf[0]
                    for urlid, tf_idf in
                    self.__invertedindex.get(term, [0, {}, 0])[1].iteritems()
                }

            if isinstance(tree, str) or isinstance(tree, unicode):
                return word2dict(tree)
            elif len(tree) == 1:
                return traversal(tree[0])
            elif tree.label() == 'orexp':
                assert tree[1] == 'OR'
                return dict_or(traversal(tree[0]), traversal(tree[2]))
            elif tree.label() == 'andexp':
                if tree[1] == 'AND':
                    return dict_and(traversal(tree[0]), traversal(tree[2]))
                else:  # implicit AND between adjacent terms
                    return dict_and(traversal(tree[0]), traversal(tree[1]))
            elif tree.label() == 'notexp':
                assert tree[0] == 'NOT'
                return dict_not(traversal(tree[1]))
            elif tree.label() == 'metaexp':
                assert tree[0] == 'LP'
                assert tree[2] == 'RP'
                return traversal(tree[1])

        if not self.parser.normalize(query):
            return []
        tree = query_to_tree(query)
        url_score_dict = traversal(tree)
        heap = []
        for url, score in url_score_dict.iteritems():
            heap.append(id_score(int(url), score))
        urlids = []
        heapq.heapify(heap)
        for i in range(k):
            if heap:
                urlids.append(heapq.heappop(heap).urlid)
        return urlids

    def search_rsv(self, query, k=50):
        # Okapi BM25 ranking (RSV): k1/k3 control term-frequency saturation and
        # b controls length normalization against the average length self.lave.
        k1 = 1.5
        k3 = 1.5
        b = 0.75
        querydict_tf = {}
        heap = []
        urlids = []
        self.querylist = self.parser.normalize(query)
        for item in self.querylist:
            if item in querydict_tf:
                querydict_tf[item] += 1
            else:
                querydict_tf[item] = 1
        for i in range(self.__urlnum):
            score = 0
            for item in querydict_tf.iterkeys():
                if item in self.__invertedindex and str(i) in self.__invertedindex[item][1]:
                    tf = self.__invertedindex[item][1][str(i)][0]
                    idf = math.log10(1.0 * self.__urlnum / self.__invertedindex[item][0])
                    doclen_norm = (1 - b) + b * (1.0 * self.urllist[i][3] / self.lave)
                    score += idf * (k1 + 1) * tf / (k1 * doclen_norm + tf) \
                        * (k3 + 1) * querydict_tf[item] / (k3 + querydict_tf[item])
            uid = id_score(i, score)
            if uid.score > 0:
                heap.append(uid)
        heapq.heapify(heap)
        for i in range(k):
            if heap:
                urlids.append(heapq.heappop(heap).urlid)
        return urlids

    def lm(self, query, k=50):
        # Query-likelihood language model with Jelinek-Mercer smoothing:
        # score(d) = prod_t (lam * tf/doclen + (1 - lam) * ctf/collectionlen) ** w_t
        querydict_tf = {}
        weight = {}
        heap = []
        urlids = []
        lam = 0.8
        self.querylist = self.parser.normalize(query)
        totaldoc = len(self.urllist)
        for item in self.querylist:
            if item in querydict_tf:
                querydict_tf[item] += 1
            else:
                querydict_tf[item] = 1
        for item in querydict_tf.iterkeys():
            if item in self.__invertedindex:
                weight[item] = (1.0 + math.log10(querydict_tf[item])) * math.log10(
                    1.0 * totaldoc / self.__invertedindex[item][0])
            else:
                weight[item] = 0
        for i in range(self.__urlnum):
            score = 1
            for item in weight.iterkeys():
                if item in self.__invertedindex and str(i) in self.__invertedindex[item][1]:
                    a = float(self.__invertedindex[item][1][str(i)][0]) / self.urllist[i][3]
                    b = float(self.__invertedindex[item][2]) / self.totallength
                    score *= (lam * a + (1 - lam) * b) ** weight[item]
                else:
                    score = 0
            uid = id_score(i, score)
            if uid.score > 0:
                heap.append(uid)
        heapq.heapify(heap)
        for i in range(k):
            if heap:
                urlids.append(heapq.heappop(heap).urlid)
        return urlids

    def gettitle(self, url):
        # Stub: titles come from the stored htmls instead of refetching pages.
        return u'in_gettitle'

    def word_correct(self, word):
        # todo: spelling correction against the index vocabulary
        word = self.parser.normalize(word)
        word_list = []
        term = self.__invertedindex.keys()
        return word_list

    def wildcard2word(self, wildcard):
        # Answer a single-'*' wildcard by rotating it so the '*' falls at the
        # end, doing a prefix lookup in the rotation index, and de-rotating.
        def derolled(word):
            assert '$' in word
            first, second = word.split('$')
            return second + first

        assert '*' in wildcard
        first, second = wildcard.split('*')
        rolled_word = second + '$' + first
        return map(derolled, self.roll_index.find_by_prefix(rolled_word))
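# A self-contained illustration of the rotation lookup wildcard2word performs,
# with a plain list standing in for VocabTree (whose implementation is not part
# of this excerpt); the helper name and sample vocabulary are made up:
def demo_wildcard(wildcard, vocabulary):
    index = []
    for word in vocabulary:
        for i in range(len(word) + 1):
            index.append(word[i:] + '$' + word[:i])  # the same rotations searcher builds
    first, second = wildcard.split('*')
    prefix = second + '$' + first  # rotate so the '*' falls at the end
    matches = [w for w in index if w.startswith(prefix)]
    # De-rotate, exactly as derolled() does above.
    return [w.split('$')[1] + w.split('$')[0] for w in matches]

# demo_wildcard('fi*er', ['filter', 'finger', 'fish']) == ['filter', 'finger']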