def getHtmlContent(target_url, header_type):
    """Download *target_url* and return the response body decoded as UTF-8.

    header_type == 'sougou' adds the sogou results-per-page cookie (value
    read from the [pagesize] config section); any other value sends plain
    browser-like headers.  On any failure the error is printed and None is
    returned implicitly, so callers must handle a None result.
    """
    config = Config()
    try:
        if header_type == 'sougou':
            # Sogou controls the number of results per page via this cookie.
            send_headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 6.2; rv:16.0) Gecko/20100101 Firefox/16.0',
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
                'Connection': 'keep-alive',
                'Cookie': 'com_sohu_websearch_ITEM_PER_PAGE=' + str(config.getValue("pagesize", "sougou"))
            }
        else:
            send_headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 6.2; rv:16.0) Gecko/20100101 Firefox/16.0',
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
                'Connection': 'keep-alive'
            }
        # BUG FIX: original compared sys.version > '3' (lexicographic string
        # compare); use the numeric major version instead.
        if sys.version_info[0] >= 3:
            req = urllib.request.Request(target_url, headers=send_headers)
            response = urllib.request.urlopen(req, timeout=10)
        else:
            req = urllib2.Request(target_url, headers=send_headers)
            # NOTE(review): Python 2 path uses a 30s timeout vs 10s on the
            # Python 3 path -- looks unintentional, confirm before unifying.
            response = urllib2.urlopen(req, timeout=30)
        return response.read().decode('utf-8')
    except Exception as e:
        # BUG FIX: Exception has no .message attribute on Python 3, so the
        # original handler itself raised AttributeError and masked the real
        # error; str(e) is portable across Python 2 and 3.
        print("Get html page content error:" + str(e))
def __init__(self, module, page, keyword):
    """Launch a Collect job for *module* in a background process.

    Looks up the module's page size from the config, announces the start,
    spawns the worker process, and registers it in the global process list
    so it can be managed later.
    """
    cfg = Config()
    per_page = cfg.getValue("pagesize", module)
    print("\033[1;37;40m[*]Search Engine [%s] starting!The number of display bars per page is %s" % (module, per_page))
    worker = multiprocessing.Process(target=Collect, args=(module, page, per_page, keyword))
    worker.start()
    # Track the worker in the shared global registry.
    running = gol.get_value("process")
    running.append(worker)
    gol.set_value("process", running)
class Collect(object):
    """Collection job for one search engine.

    Instantiating the class immediately runs the whole collection via
    self.collection(); results are optionally written to an OutFile when
    the [global] savefile config value is the string 'True'.
    """

    def __init__(self, module, page, pagesize, keyword):
        self.config = Config()
        self.module = module
        self.page = int(page)
        self.keyword = keyword
        self.pageSize = int(pagesize)
        # Config values are read back as strings, hence the 'True' compare.
        self.saveFile = self.config.getValue("global", "savefile")
        if self.saveFile == 'True':
            self.outfile = OutFile(unquote(self.keyword))
        else:
            self.outfile = None
        self.collection()

    def collection(self):
        """Iterate over the requested pages and dispatch to the engine module."""
        for i in range(self.page):
            print("\033[1;37;40m[*]Search Engine [%s],Page [%s] Start collecting." % (self.module, i+1))
            # Baidu paginates by result offset; so/sougou paginate by page number.
            page_pn = (i * self.pageSize)
            if self.module == "baidu":
                my_baidu = Baidu(self.outfile)
                my_baidu.search(self.keyword, self.pageSize, page_pn)
            elif self.module == "so":
                my_so = So(self.outfile)
                my_so.search(self.keyword, i+1)
            elif self.module == "sougou":
                my_sougou = Sougou(self.outfile)
                my_sougou.search(self.keyword, i+1)
            # Optional politeness delay between pages.
            if self.config.sleeptime > 0:
                time.sleep(self.config.sleeptime)
        # BUG FIX: the original compared the OutFile object itself to the
        # string 'True' (always false), so the output file was never closed.
        # Close it whenever one was opened.
        if self.outfile is not None:
            self.outfile.closeFile()
class Collect(object):
    """Collection job for one search engine.

    Instantiating the class immediately runs the whole collection via
    self.collection(); results are optionally written to an OutFile when
    the [global] savefile config value is the string 'True'.
    """

    def __init__(self, module, page, pagesize, keyword):
        self.config = Config()
        self.module = module
        self.page = int(page)
        self.keyword = keyword
        self.pageSize = int(pagesize)
        # Config values are read back as strings, hence the 'True' compare.
        self.saveFile = self.config.getValue("global", "savefile")
        if self.saveFile == 'True':
            self.outfile = OutFile(unquote(self.keyword))
        else:
            self.outfile = None
        self.collection()

    def collection(self):
        """Iterate over the requested pages and dispatch to the engine module."""
        for i in range(self.page):
            print(
                "\033[1;37;40m[*]Search Engine [%s],Page [%s] Start collecting." % (self.module, i + 1))
            # Baidu paginates by result offset; so/sougou paginate by page number.
            page_pn = (i * self.pageSize)
            if self.module == "baidu":
                my_baidu = Baidu(self.outfile)
                my_baidu.search(self.keyword, self.pageSize, page_pn)
            elif self.module == "so":
                my_so = So(self.outfile)
                my_so.search(self.keyword, i + 1)
            elif self.module == "sougou":
                my_sougou = Sougou(self.outfile)
                my_sougou.search(self.keyword, i + 1)
            # Optional politeness delay between pages.
            if self.config.sleeptime > 0:
                time.sleep(self.config.sleeptime)
        # BUG FIX: the original compared the OutFile object itself to the
        # string 'True' (always false), so the output file was never closed.
        # Close it whenever one was opened.
        if self.outfile is not None:
            self.outfile.closeFile()
def __init__(self, module, page, keyword):
    """Start a background Collect worker for the given search engine.

    Reads the engine's page size from the config, prints the startup
    banner, forks the worker process and appends it to the shared global
    process list.
    """
    conf = Config()
    size = conf.getValue("pagesize", module)
    print(
        "\033[1;37;40m[*]Search Engine [%s] starting!The number of display bars per page is %s" % (module, size))
    proc = multiprocessing.Process(target=Collect, args=(module, page, size, keyword))
    proc.start()
    # Register the worker so the caller can join/terminate it later.
    tracked = gol.get_value("process")
    tracked.append(proc)
    gol.set_value("process", tracked)
def getHtmlContent(target_url, header_type):
    """Download *target_url* and return the response body decoded as UTF-8.

    header_type == 'sougou' adds the sogou results-per-page cookie (value
    read from the [pagesize] config section); any other value sends plain
    browser-like headers.  On any failure the error is printed and None is
    returned implicitly, so callers must handle a None result.
    """
    config = Config()
    try:
        if header_type == 'sougou':
            # Sogou controls the number of results per page via this cookie.
            send_headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 6.2; rv:16.0) Gecko/20100101 Firefox/16.0',
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
                'Connection': 'keep-alive',
                'Cookie': 'com_sohu_websearch_ITEM_PER_PAGE=' + str(config.getValue("pagesize", "sougou"))
            }
        else:
            send_headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 6.2; rv:16.0) Gecko/20100101 Firefox/16.0',
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
                'Connection': 'keep-alive'
            }
        # BUG FIX: original compared sys.version > '3' (lexicographic string
        # compare); use the numeric major version instead.
        if sys.version_info[0] >= 3:
            req = urllib.request.Request(target_url, headers=send_headers)
            response = urllib.request.urlopen(req, timeout=10)
        else:
            req = urllib2.Request(target_url, headers=send_headers)
            # NOTE(review): Python 2 path uses a 30s timeout vs 10s on the
            # Python 3 path -- looks unintentional, confirm before unifying.
            response = urllib2.urlopen(req, timeout=30)
        return response.read().decode('utf-8')
    except Exception as e:
        # BUG FIX: Exception has no .message attribute on Python 3, so the
        # original handler itself raised AttributeError and masked the real
        # error; str(e) is portable across Python 2 and 3.
        print("Get html page content error:" + str(e))
class Filter(object):
    """Filters collected search results by domain blacklist, title keywords
    and (optionally) strips URL parameters.

    The three feature toggles are read from the [filter] config section as
    the strings 'True'/'False'; the blacklists are loaded from
    config/filter_url.txt and config/filter_title.txt (one entry per line).
    """

    def __init__(self):
        self.config = Config()
        self.filterUrlParam = self.config.getValue("filter", "filter_urlparam")
        self.filterUrl = self.config.getValue("filter", "filter_url")
        self.filterTitle = self.config.getValue("filter", "filter_title")
        self.filterUrlList = self.get_filterurl()
        self.filterTitleList = self.get_filtertitle()

    def filter_data(self, url, title):
        """Return 'filter' if the result is blacklisted, otherwise the URL
        (with parameters stripped when filter_urlparam is enabled)."""
        try:
            # registered domain + suffix, e.g. 'example.com'
            urldata = tldextract.extract(url)
            domain = '.'.join(urldata[1:3])
        except Exception:
            print("解析URL:" + url + " 失败!")
            domain = url
        if self.filterUrl == 'True':
            # BUG FIX: the list used to be the raw file contents, making this
            # a substring match against the whole file; it is now an exact
            # match against one domain per line.
            if domain in self.filterUrlList:
                return 'filter'
        if self.filterTitle == 'True':
            # BUG FIX: previously iterated the raw file string character by
            # character; now iterates one keyword per line.
            for filter_titlestr in self.filterTitleList:
                if filter_titlestr in title:
                    return 'filter'
        if self.filterUrlParam == 'True':
            reg = r'^https?:\/\/([a-z0-9\-\.]+)[\/\?]?'
            m = re.match(reg, url)
            if m:
                uri = m.groups()[0]
                return uri[uri.rfind('//', 0, uri.rfind('.')) + 1:]
        # BUG FIX: the original implicitly returned None when
        # filter_urlparam was disabled, losing unfiltered URLs.
        return url

    def get_filterurl(self):
        """Load the domain blacklist, one domain per line, blanks skipped."""
        with open('config/filter_url.txt') as file_object:
            return [line.strip() for line in file_object if line.strip()]

    def get_filtertitle(self):
        """Load the title-keyword blacklist, one keyword per line.

        BUG FIX: the original called .decode("utf-8") on the str returned by
        read(), which raises AttributeError on Python 3; codecs.open yields
        decoded text on both Python 2 and 3.
        """
        import codecs
        with codecs.open('config/filter_title.txt', 'r', 'utf-8') as file_object:
            return [line.strip() for line in file_object if line.strip()]