def __init__(self, url):
    self._result = []
    self._uid = "1000"
    self._bid = "28e05cbd"
    # Current Unix timestamp (seconds) used as the client id
    self._cid = str(time.time()).split('.')[0]
    self._did = "4509435"
    self._url = url
    # Fetch the page once on construction, then parse it
    getter = GetHtml()
    getter.set(self._url)
    self._data = getter.get()
    self._process()
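# GetHtml is not defined in this section. A minimal sketch of the assumed set()/get()
# interface, written against urllib.request, is shown below; the User-Agent header, the
# timeout and the raw-bytes return value are assumptions, not the original implementation.
import urllib.request

class GetHtml:
    def __init__(self):
        self._url = None

    def set(self, url):
        # Remember the target URL for the next get() call
        self._url = url

    def get(self):
        # Fetch the page and return the raw response bytes; callers decode or write them
        request = urllib.request.Request(self._url,
                                         headers={'User-Agent': 'Mozilla/5.0'})
        with urllib.request.urlopen(request, timeout=10) as response:
            return response.read()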
def download_img_list(img_dir, img_url, page):
    print("Start download", img_dir)
    img_list = GetImgList(img_url)
    img_getter = GetHtml()
    # Fetch every image URL and save it as <index>.jpg inside img_dir
    for s, img in enumerate(img_list):
        print('Page:', page, '**', img_dir, 'Now downloading', s + 1, "/", len(img_list))
        img_getter.set(str(img))
        with open(img_dir + '\\' + str(s) + '.jpg', 'wb') as fs:
            fs.write(img_getter.get())
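# A hedged usage example for download_img_list: the album URL, the page number and the
# target directory are placeholders, and GetImgList is assumed to return a list of image URLs.
import os

if __name__ == '__main__':
    target_dir = '.\\Download\\sample_album'   # hypothetical directory
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)
    download_img_list(target_dir, 'https://example.com/album/1.html', page=1)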
def run(self):
    # Collect users from the list page
    html = GetHtml(self.url)
    # Capture id, level, certification mark, main-page link, portrait, nickname, address and follower count
    users = re.findall(
        r'divEditOperate_(\d*?)\".*?<span class=\"mainColor weight700\">.*?>(.*?)</span></span>(<br/>)?'
        r'.*?href=(.*?) hidefocus.*?src=\"(.*?)\".*?alt=(.*?) title=.*?'
        r'<p class="font12 lesserColor">(.*?) 粉丝 ><span class="font12 mainColor">(\d*?)</span',
        html, re.S)
    # Write every user to the database; an empty <br/> group means the user is not certified
    for user in users:
        certified = "False" if user[2] == "" else "True"
        self.Write2database(user[0], user[1], certified,
                            "http://www.moko.cc" + user[3],
                            user[4], user[5], user[6], user[7])
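# In this snippet and the ones below, GetHtml is called as a plain function that returns
# the decoded page text, or None on failure (the retry loops in the later snippets rely on
# that). A sketch of such a wrapper; the headers, encoding and error handling are assumptions.
import urllib.request

def GetHtml(url, timeout=10):
    try:
        request = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
        with urllib.request.urlopen(request, timeout=timeout) as response:
            return response.read().decode('utf-8', 'ignore')
    except Exception:
        # Signal failure to the caller, which may retry
        return None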
def run(self):
    while Producer_Flag or self.url_list:
        if self.url_list:
            url = self.url_list[0]
            html = GetHtml(url)
            # Retry until the page is fetched successfully
            while html is None:
                html = GetHtml(url)
            del self.url_list[0]
            try:
                bs = BeautifulSoup(html, 'html.parser')  # standard-library parser
                title = self.getTitle(bs)
                content = self.getContent(bs)
                print(" [+] Spider{0}: {1} parsed".format(self.id, title))
                writer.write(url, title, content)
            except Exception as e:
                print(" [-] Spider{0}: {1}\n\twhen parsing {2}".format(self.id, e, url))
                # Put the URL back at the front of the queue so it is parsed again
                self.url_list.insert(0, url)
    print(" [-] Spider {0} finished...".format(self.id))
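# The retry loop above keeps re-fetching the same URL until GetHtml stops returning None,
# which can spin forever on a permanently unreachable page. A bounded-retry helper such as
# this sketch (the function name, attempt count and delay are assumptions) could be used instead.
import time

def fetch_with_retries(url, attempts=5, delay=1.0):
    for _ in range(attempts):
        html = GetHtml(url)
        if html is not None:
            return html
        time.sleep(delay)   # back off briefly before retrying
    return None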
def run(self):
    html = GetHtml(self.url, timeout=3)
    # Retry with a longer timeout until the chapter index page is fetched
    while html is None:
        html = GetHtml(self.url, timeout=4)
    if webPage == "https://qxs.la/":
        chapters = re.findall(r'<div class=\"chapter\">.*?<a href=\"(.*?)\"', html, re.S)
        # Drop the "https://qxs.la" prefix (14 characters) so the values match the relative hrefs
        self.start_url = self.start_url[14:]
        self.end_url = self.end_url[14:]
        start = chapters.index(self.start_url)
        end = chapters.index(self.end_url) + 1
        chapters = chapters[start:end]
        for s in chapters:
            url = "https://qxs.la" + s
            writer.addUrl(url)
            # Give the URL to the consumer with the fewest queued tasks
            minn = 6666
            minid = -1
            for consumer in consumer_list:
                if consumer.GetRemainedTasks() < minn:
                    minn = consumer.GetRemainedTasks()
                    minid = consumer.GetId()
            consumer_list[minid].addUrl(url)
    elif webPage == "http://www.clewx.com/":
        # Without re.S the '.' stops at newlines; with re.S it matches across lines
        chapters = re.findall(r'<dd><a href=\"(.*?)\" title', html)
        print(chapters)
        start = chapters.index(self.start_url)
        end = chapters.index(self.end_url) + 1
        chapters = chapters[start:end]
        for url in chapters:
            writer.addUrl(url)
            # Give the URL to the consumer with the fewest queued tasks
            minn = 6666
            minid = -1
            for consumer in consumer_list:
                if consumer.GetRemainedTasks() < minn:
                    minn = consumer.GetRemainedTasks()
                    minid = consumer.GetId()
            consumer_list[minid].addUrl(url)
    elif webPage == "http://www.k6uk.com/":
base_t_url = "https://l.bdear.xyz/"
find_tid = re.compile(r'<a href="thread-.*?</a>')
re_tid = re.compile(r'thread-.*?/')
re_title = re.compile(r'>.*?<')
if not os.path.exists('.\\Download'):
    os.mkdir('.\\Download')
threads = []
t_sum = 0
m_sum = 0
for i in range(int(start), int(end) + 1):
    url = base_url + str(i) + '.html'
    ht = GetHtml()
    ht.set(url)
    html = str(ht.get().decode('utf-8', 'ignore'))
    all_tid = find_tid.findall(html)
    for data in all_tid:
        if re_title.search(data) is not None:
            tid = re_tid.search(data).group(0).split('/')[0]
            # Pull the title out of the anchor text and strip characters that are
            # invalid in Windows directory names
            title = (re_title.search(data).group(0).split('>')[1].split('<')[0]
                     .replace('.', '').replace(':', '-')
                     .replace('/', '-').replace('?', '-'))
            # Skip purely numeric titles
            if title.isdigit():
                continue
            img_dir = '.\\Download\\' + title
def GetUserPages(url):
    # Return the highest page number shown in the pagination links
    html = GetHtml(url)
    pageNums = re.findall(r'onfocus=\"this\.blur\(\)\">(\d*?)<', html, re.S)
    # Compare numerically; a lexicographic max on strings would rank '9' above '10'
    return max(int(n) for n in pageNums)
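# A hedged usage example for GetUserPages: the list URL is a placeholder for whatever
# user-list page the scraper actually targets on www.moko.cc.
if __name__ == '__main__':
    list_url = 'http://www.moko.cc/channels/post/23/1.html'   # hypothetical first page
    total_pages = GetUserPages(list_url)
    print('user list spans', total_pages, 'pages')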