def load_index():
    ''' load index '''
    global movie_list
    version = begin_time()
    text = proxy_req(HOMEPAGE_URL, 3)
    if not len(text):
        if can_retry(HOMEPAGE_URL):
            load_index()
        return
    movie_list = re.findall('《(.*?)》', text)
    movie_more = re.findall('href="(.*?)">更多', text)
    for uri in movie_more:
        load_other(uri)
    threading_list = [threading.Thread(
        target=load_other, args=(ii,)) for ii in movie_another]
    shuffle_batch_run_thread(threading_list, 100)
    threading_list = [threading.Thread(
        target=load_other, args=(ii,)) for ii in movie_again]
    shuffle_batch_run_thread(threading_list, 100)
    # deduplicate the movie list
    movie_list = set(movie_list)
    # dump the crawled movie list
    out_path = 'dytt8_result.txt'
    with open(out_path, 'w') as f:
        f.write('\n'.join(movie_list))
    url_num = len([*movie_more, *movie_another]) + 1
    movie_num = len(movie_list)
    echo(1, 'Requests num: {}\nMovie num: {}\nOutput path: {}\nSpend time: {:.2f}s\n'.format(
        url_num, movie_num, out_path, end_time(version, 0)))
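# --- illustration, not part of the original source ---
# `shuffle_batch_run_thread` is an external helper; judging by its name and
# its call sites above, it presumably shuffles the thread list and then runs
# it in fixed-size batches so that only `batch_size` requests are in flight
# at once. A minimal sketch under that assumption (the real helper also
# accepts a third flag whose meaning is not shown in this file):
import random


def shuffle_batch_run_thread_sketch(threads: list, batch_size: int):
    ''' hypothetical stand-in: shuffle, then start/join batch by batch '''
    random.shuffle(threads)
    for begin in range(0, len(threads), batch_size):
        batch = threads[begin:begin + batch_size]
        for work in batch:
            work.start()
        for work in batch:
            work.join()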
def get_song_detail_thread(self):
    """ get song details in threads """
    version = begin_time()
    for classify in self.classifylist:
        ids = self.get_list_ids(classify)
        threadings = []
        for oneid in ids:
            work = threading.Thread(
                target=self.get_song_detail, args=(oneid[1],))
            threadings.append(work)
        for work in threadings:
            work.start()
        for work in threadings:
            work.join()
        self.clean_data()
        self.test_song(classify, ids)
        # reset per-classify state before the next category
        self.songlist = []
        self.songmap = {}
        self.finishlist = []
        self.successtime = 0
        print(classify + ' Over!')
    end_time(version)
def load_collect(self, page):
    """ load collect """
    version = begin_time()
    if not os.path.exists('%scookie_collect' % data_dir):
        print('TB cookie does not exist!!!')
        return
    with codecs.open('%scookie_collect' % data_dir, 'r', encoding='utf-8') as f:
        cookie = f.readline()
    changeCookie(cookie[:-1])  # drop the trailing newline
    changeHtmlTimeout(30)
    for block in range(page // 10 + 1):
        begin = block * 10
        end = min(page, (block + 1) * 10)
        threadings = []
        for index in range(begin, end):
            work = threading.Thread(
                target=self.load_collect_once, args=(index,))
            threadings.append(work)
        for work in threadings:
            work.start()
        for work in threadings:
            work.join()
    collect = [self.collect[k] for k in sorted(self.collect.keys())]
    collect = sum(collect, [])  # flatten the per-page lists
    with codecs.open('%scollect_wyy' % data_dir, 'w', encoding='utf-8') as f:
        f.write("\n".join(collect))
    end_time(version)
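# --- illustration, not part of the original source ---
# load_collect() walks `page` pages in blocks of at most ten threads.
# For example, page == 23 yields the (begin, end) pairs (0, 10), (10, 20),
# (20, 23):
for block in range(23 // 10 + 1):
    begin, end = block * 10, min(23, (block + 1) * 10)
    print(begin, end)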
def search_goods(self):
    version = begin_time()
    if not os.path.exists('%swait' % data_dir):
        print('wait file does not exist!!!')
        return
    with codecs.open('%swait' % data_dir, 'r', encoding='utf-8') as f:
        wait = f.readlines()
    threadings = []
    for index, goods_name in enumerate(wait):
        work = threading.Thread(
            target=self.search_goods_once, args=(goods_name[:-1], index,))
        threadings.append(work)
    for work in threadings:
        work.start()
        time.sleep(random.randint(5, 9))  # stagger the requests
    for work in threadings:
        work.join()
    goods_name = [self.goods_name[k] for k in sorted(self.goods_name.keys())]
    with codecs.open('%swait_goods' % data_dir, 'w', encoding='utf-8') as f:
        f.write('\n'.join(goods_name))
    end_time(version)
def get_playlist_id_thread(self):
    """ get playlist ids in threads """
    version = begin_time()
    if not len(self.classifylist):
        self.get_classify()
    for index in self.classifylist:
        threadings = []
        for offset in range(41):
            work = threading.Thread(
                target=self.get_playlist_id, args=(index, offset * 35,))
            threadings.append(work)
        for work in threadings:
            work.start()
        for work in threadings:
            work.join()
        print(len(self.playlists))
        self.test_queue(index)
        self.playlists = []
        print(index + ' Over')
    end_time(version)
def load_goods(self):
    """ load goods """
    version = begin_time()
    if not os.path.exists('%scookie' % data_dir):
        print('Youdao Note cookie does not exist!!!')
        return
    with codecs.open('%scookie' % data_dir, 'r', encoding='utf-8') as f:
        cookie = f.readline()
    changeCookie(cookie[:-1])  # drop the trailing newline
    threadings = []
    for index, tid in enumerate(self.request_list):
        work = threading.Thread(
            target=self.load_goods_once, args=(index, tid,))
        threadings.append(work)
    for work in threadings:
        work.start()
    for work in threadings:
        work.join()
    goods = [self.goods[k] for k in sorted(self.goods.keys())]
    goods = sum(goods, [])  # flatten the per-request lists
    with codecs.open('%sgoods' % data_dir, 'w', encoding='utf-8') as f:
        f.write("\n".join(goods))
    end_time(version)
def get_summarization(self):
    """
    get summarization from http://news.baidu.com/ns?word=%E6%AF%92%E7%8B%97%E8%82%89&tn=news&from=news&cl=2&rn=20&ct=1
    """
    version = begin_time()
    threadings = []
    for index in range(75):
        work = threading.Thread(
            target=self.summarization_once, args=(index,))
        threadings.append(work)
    for work in threadings:
        # time.sleep(.5)
        work.start()
    for work in threadings:
        work.join()
    # self.text_map = self.total_map[0]
    # for index in list(range(1, len(self.total_map))):
    #     for ids in self.total_map[index]:
    #         if ids in self.text_map:
    #             self.text_map[ids] += self.total_map[index][ids]
    #         else:
    #             self.text_map[ids] = self.total_map[index][ids]
    # print(sum(self.text_map))
    word = [self.word[k] for k in sorted(self.word.keys())]
    with codecs.open('test', 'w', encoding='utf-8') as f:
        f.write("\n".join(word))
    end_time(version)
def get_detail(self):
    """
    get detail from http://news.baidu.com/ns?word=%E6%AF%92%E7%8B%97%E8%82%89&tn=news&from=news&cl=2&rn=20&ct=1
    """
    version = begin_time()
    threadings = []
    with codecs.open('bjh_href_poison.txt', 'r', encoding='utf-8') as f:
        href_list = f.readlines()
    for index, url in enumerate(href_list):
        work = threading.Thread(
            target=self.detail_once, args=(index, url,))
        threadings.append(work)
    for work in threadings:
        # time.sleep(.5)
        work.start()
    for work in threadings:
        work.join()
    word_list = [self.word_list[k] for k in sorted(self.word_list.keys())]
    with codecs.open('bjh_detail_poison', 'w', encoding='utf-8') as f:
        f.write("\n".join(word_list))
    self.failuredmap = {}
    with codecs.open('bjh.log', 'w', encoding='utf-8') as f:
        f.write('\n'.join(self.fail))
    self.fail = []
    end_time(version)
def get_summarization(self):
    """
    get summarization from https://www.google.com.hk/search?q=%E6%AF%92%E7%8B%97%E8%82%89&newwindow=1&safe=strict&tbm=nws&ei=FK1KXJ3EJbWx0PEPytmq2AI&start=0&sa=N&ved=0ahUKEwidnv-7p4jgAhW1GDQIHcqsCis4ChDy0wMIRw&biw=1627&bih=427&dpr=2
    """
    version = begin_time()
    threadings = []
    for index in range(25):
        work = threading.Thread(
            target=self.summarization_once, args=(index,))
        threadings.append(work)
    for work in threadings:
        time.sleep(1)
        work.start()
    for work in threadings:
        work.join()
    summarizations = [self.summarizations[k]
                      for k in sorted(self.summarizations.keys())]
    self.summarizations = sum(summarizations, [])
    hrefs = [self.hrefs[k] for k in sorted(self.hrefs.keys())]
    self.hrefs = sum(hrefs, [])
    with codecs.open('google_steal.txt', 'w', encoding='utf-8') as f:
        f.write('\n'.join(self.summarizations))
    with codecs.open('google_steal_href.txt', 'w', encoding='utf-8') as f:
        f.write('\n'.join(self.hrefs))
    end_time(version)
def goubanjia(self):
    """
    goubanjia proxy
    http://www.goubanjia.com
    :-1: the html tags are mixed with invalid data
    :100: most importantly, the port is written in the 'class' attribute
          rather than in the text. The site is difficult to spider,
          but the proxies are very good.
    """
    version = begin_time()
    host = "http://www.goubanjia.com"
    html = self.proxy_req(host, 0)
    if not html:
        return []
    trs = html.find_all("tr", class_=["warning", "success"])
    for tr in trs:
        tds = tr.find_all("td")
        ip = tds[2].find_all("a")[0].text + "://"
        # the IP is split across un-classed <div>/<span> fragments
        iplist = tds[0].find_all(["div", "span"], class_=False)
        for index in iplist:
            ip += index.text
        # the real port is encoded letter-by-letter in the second class token
        encode = tds[0].find_all(["div", "span", "p"], class_="port")[0]["class"][1]
        uncode = functools.reduce(
            lambda x, y: x * 10 + (ord(y) - ord("A")), encode, 0)
        self.waitjudge.append(ip + ":" + str(uncode // 8))
    self.thread_judge()
    end_time(version, 2)
def goubanjia(self):
    """
    goubanjia proxy
    http://www.goubanjia.com
    :-1: the html tags are mixed with invalid data
    :100: most importantly, the port is written in the 'class' attribute
          rather than in the text. The site is difficult to spider,
          but the proxies are very good.
    """
    version = begin_time()
    host = 'http://www.goubanjia.com'
    html = self.proxy_req(host, 0)
    if not html:
        return []
    trs = html.find_all('tr', class_=['warning', 'success'])
    for tr in trs:
        tds = tr.find_all('td')
        ip = tds[2].find_all('a')[0].text + '://'
        # the IP is split across un-classed <div>/<span> fragments
        iplist = tds[0].find_all(['div', 'span'], class_=False)
        for index in iplist:
            ip += index.text
        # the real port is encoded letter-by-letter in the second class token
        encode = tds[0].find_all(['div', 'span', 'p'], class_='port')[0]['class'][1]
        uncode = functools.reduce(
            lambda x, y: x * 10 + (ord(y) - ord('A')), encode, 0)
        self.waitjudge.append(ip + ':' + str(uncode // 8))
    self.threadjude()
    end_time(version)
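# --- illustration, not part of the original source ---
# Worked example of the goubanjia port decode above: the letters of the
# second `class` token map A->0 ... J->9, are concatenated as decimal digits,
# and the result is divided by 8. A hypothetical token 'GEA' therefore
# decodes as G,E,A -> 6,4,0 -> 640, and 640 // 8 == 80.
import functools


def decode_goubanjia_port(encode: str) -> int:
    ''' decode a goubanjia 'port' class token, e.g. 'GEA' -> 80 '''
    raw = functools.reduce(lambda x, y: x * 10 + (ord(y) - ord('A')), encode, 0)
    return raw // 8


assert decode_goubanjia_port('GEA') == 80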
def participles_word(self):
    """ segment words with pkuseg """
    version = begin_time()
    for file in self.filelists:
        pkuseg.test(file, file[:-4] + '_pkuseg.txt',
                    model_name='../Model_retrieval/pkuseg', nthread=20)
    end_time(version)
def pre_data_list(self, do_pre):
    version = begin_time()
    if do_pre == 1:     # also matches do_pre=True, since True == 1
        self.load_all(0)
        self.load_all(1)
    elif do_pre == 2:
        self.load_all_pickle(0)
        self.load_all_pickle(1)
    else:
        self.load_basic(1)
    end_time(version)
def get_movie_lists(self):
    ''' get movie list '''
    version = begin_time()
    movie_get = []
    for kk in range(0, 1100, 100):
        for jj in self.sort_list:
            for ii in self.tag_movie:
                movie_get.append(threading.Thread(
                    target=self.get_movie_lists_once, args=('movie', ii, jj, kk,)))
            for ii in self.tag_tv:
                movie_get.append(threading.Thread(
                    target=self.get_movie_lists_once, args=('tv', ii, jj, kk,)))
    shuffle_batch_run_thread(movie_get, 500, True)
    # retry the requests that failed in the first pass
    again_list = [threading.Thread(target=self.get_movie_lists_once,
                                   args=(ii[0], ii[1], ii[2], ii[3],))
                  for ii in self.again_list]
    shuffle_batch_run_thread(again_list, 500, True)
    self.again_list = []
    echo(1, len(self.movie_id2name.keys()))

    changeHtmlTimeout(40)
    movie_get = []
    tag_categories = self.tag_categories
    for mm in range(0, 10000, 1000):
        for tags in tag_categories[0][1:]:
            for genres in tag_categories[1][1:]:
                for ii, jj in self.yearMap.values():
                    year_range = '{},{}'.format(ii, jj)
                    for sorts in self.tabs:
                        movie_get.append(threading.Thread(
                            target=self.get_movie_list_from_tabs,
                            args=(sorts, tags, genres, year_range, mm,)))
    echo(2, 'Thread Num:', len(movie_get))
    shuffle_batch_run_thread(movie_get, 900, True)
    # failed items carry five args for the tabs API, four for the list API
    again_list = [threading.Thread(target=self.get_movie_list_from_tabs,
                                   args=(ii[0], ii[1], ii[2], ii[3], ii[4],))
                  if len(ii) == 5
                  else threading.Thread(target=self.get_movie_lists_once,
                                        args=(ii[0], ii[1], ii[2], ii[3],))
                  for ii in self.again_list]
    shuffle_batch_run_thread(again_list, 900, True)
    time.sleep(120)

    changeJsonTimeout(10)
    for ii in self.rank_list:
        self.get_movie_rank(ii, 0)
        if ii == 'top250':
            self.get_movie_rank(ii, 100)
            self.get_movie_rank(ii, 200)
    movie_list = self.movie_id2name.keys()
    output_path = '{}douban_movie_id'.format(data_dir)
    with open(output_path + '.txt', 'w') as f:
        f.write('\n'.join([str(ii) for ii in movie_list]))
    dump_bigger(self.movie_id2name, output_path + '.pkl')
    movie_num = len(movie_list)
    echo(1, 'Movie num: {}\nOutput path: {}\nSpend time: {:.2f}s\n'.format(
        movie_num, output_path, end_time(version, 0)))
def load_proxies_test(self):
    """ load and test proxies """
    version = begin_time()
    self.load_proxies_list()
    proxies_len = len(self.waitjudge)
    self.thread_judge()
    canuse_len = len(self.canuse_proxies)
    echo("1|info",
         "\nTotal Proxies num: {}\nCan use num: {}\nTime spent: {}\n".format(
             proxies_len, canuse_len, end_time(version)))
    with open("{}canuse_proxies.txt".format(data_dir), "w") as f:
        f.write("\n".join(self.canuse_proxies))
def have_places(self):
    """ poll until the class has open places """
    version = begin_time()
    have_places = False
    while not have_places:
        if self.have_places_once():
            # '大数据专题 有名额啦' -- "Big Data seminar: places available!";
            # send the alert three times to make sure it is noticed
            send_email('大数据专题', '大数据专题 有名额啦 有名额啦')
            send_email('大数据专题', '大数据专题 有名额啦 有名额啦')
            send_email('大数据专题', '大数据专题 有名额啦 有名额啦')
            have_places = True
        time.sleep(random.randint(10, 20))
    end_time(version)
def press_threading(self, url, qps, types):
    """ press url at constant qps """
    version = begin_time()
    threadings = []
    for index in range(qps):
        work = threading.Thread(
            target=self.basic_press, args=(url, 0, types))
        threadings.append(work)
    for work in threadings:
        work.start()
    for work in threadings:
        work.join()
    end_time(version)
def kuaidaili(self, page: int):
    """
    kuaidaili proxy
    https://www.kuaidaili.com/free/
    """
    version = begin_time()
    threadings = []
    for index in range(1, page + 1):
        work = threading.Thread(target=self.kuaidailithread, args=(index,))
        threadings.append(work)
    for work in threadings:
        work.start()
    for work in threadings:
        work.join()
    self.thread_judge()
    end_time(version, 2)
def test_db(self, types: int):
    """ test whether the proxies in the db are usable """
    version = begin_time()
    if types == 2:
        typestr = "(0,1,2,3)"
    elif types == 1:
        typestr = "(1,3)"
    else:
        typestr = "(0,2)"
    results = self.Db.select_db(self.select_all % typestr)
    if results:
        for index in results:
            self.waitjudge.append(index[0])
    self.thread_judge()
    self.init_proxy()
    end_time(version, 2)
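# --- illustration, not part of the original source ---
# `self.select_all` is defined elsewhere; the `% typestr` substitution implies
# a template with a single placeholder for the IN-clause. A hypothetical
# template and the query it would render for types == 1:
select_all = 'SELECT address FROM ip_proxy WHERE `types` IN %s'  # made up
print(select_all % '(1,3)')
# -> SELECT address FROM ip_proxy WHERE `types` IN (1,3)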
def sixsixip(self, area: int, page: int):
    """
    66ip proxy
    http://www.66ip.cn/areaindex_{area}/{page}.html
    """
    version = begin_time()
    threadings = []
    for index in range(1, area + 1):
        for pageindex in range(1, page + 1):
            echo("2|debug", "{} {}".format(index, pageindex))
            work = threading.Thread(target=self.sixsixthread,
                                    args=(index, pageindex))
            threadings.append(work)
    for work in threadings:
        work.start()
    for work in threadings:
        work.join()
    self.thread_judge()
    end_time(version, 2)
def sixsixip(self, area, page):
    """
    66ip proxy
    http://www.66ip.cn/areaindex_{area}/{page}.html
    """
    version = begin_time()
    threadings = []
    for index in range(1, area + 1):
        for pageindex in range(1, page + 1):
            echo(2, str(index) + ' ' + str(pageindex))
            work = threading.Thread(target=self.sixsixthread,
                                    args=(index, pageindex))
            threadings.append(work)
    for work in threadings:
        work.start()
    for work in threadings:
        work.join()
    self.threadjude()
    end_time(version)
def get_movie_lists(self):
    ''' get movie list '''
    version = begin_time()
    movie_get = []
    for ii in self.tag:
        for jj in self.sort_list:
            movie_get.append(threading.Thread(
                target=self.get_movie_lists_once, args=('movie', ii, jj, 0,)))
    for ww in movie_get:
        ww.start()
    for ww in movie_get:
        ww.join()
    movie_list = set(sum(self.movie_id_dict.values(), []))
    output_path = '{}douban_movie_id.txt'.format(data_dir)
    with open(output_path, 'w') as f:
        f.write('\n'.join([str(ii) for ii in movie_list]))
    movie_num = len(movie_list)
    echo(1, 'Movie num: {}\nOutput path: {}\nSpend time: {:.2f}s\n'.format(
        movie_num, output_path, end_time(version, 0)))
def data5u(self):
    """
    data5u proxy
    http://www.data5u.com/
    (hardly any of these are usable)
    """
    version = begin_time()
    url_list = ["", "free/gngn/index.shtml", "free/gwgn/index.shtml"]
    host = "http://www.data5u.com/"
    for uri in url_list:
        html = self.proxy_req(host + uri, 0)
        if not html:
            continue
        table = html.find_all("ul", class_="l2")
        for index in table:
            tds = index.find_all("li")
            scheme = tds[3].text  # the protocol lives in the fourth <li>
            self.waitjudge.append(
                "{}://{}:{}".format(scheme, tds[1].text, tds[2].text))
    self.thread_judge()
    end_time(version, 2)
def data5u(self):
    """
    data5u proxy
    http://www.data5u.com/
    (hardly any of these are usable)
    """
    version = begin_time()
    url_list = ['', 'free/gngn/index.shtml', 'free/gwgn/index.shtml']
    host = 'http://www.data5u.com/'
    for uri in url_list:
        html = self.proxy_req(host + uri, 0)
        if not html:
            continue
        table = html.find_all('ul', class_='l2')
        for index in table:
            tds = index.find_all('li')
            scheme = tds[3].text  # the protocol lives in the fourth <li>
            self.waitjudge.append(
                scheme + '://' + tds[0].text + ':' + tds[1].text)
    self.threadjude()
    end_time(version)
def xici_proxy(self, page: int):
    """
    xici proxy
    http://www.xicidaili.com/nn/{page}
    The first proxy source I used, but it is mostly unusable now.
    """
    if not str(page).isdigit():
        echo("0|warning", "Please input a number!")
        return []
    version = begin_time()
    url = "http://www.xicidaili.com/nn/%d"
    for index in range(1, page + 1):
        html = basic_req(url % index, 0)
        tem = html.find_all("tr")
        for row in tem[1:]:  # skip the header row
            tds = row.find_all("td")
            scheme = tds[5].text.lower()
            self.waitjudge.append(
                "{}://{}:{}".format(scheme, tds[1].text, tds[2].text))
    self.thread_judge()
    end_time(version, 2)
def parse_detail(self, hotel_id: int = 4889292):
    ''' parse hotel detail '''
    version = begin_time()
    # self.user_action(hotel_id)
    # self.generate_cookie(hotel_id)
    # self.prepare_req()
    text = self.get_hotel_detail(hotel_id)
    html = BeautifulSoup(text['html'], 'html.parser')
    trs = html.findAll('tr')[2:]

    hotel_detail = []
    for tr in trs:
        room_name = re.findall('baseroomname="(.*?)"', str(tr))
        if not len(room_name):
            room_name = re.findall('l="nofollow">\n(.*?)\n', str(tr))
        # rows without a name belong to the previous room type
        room_name = room_name[0].strip() if len(room_name) else (
            hotel_detail[-1][0] if len(hotel_detail) else '')
        price = re.findall(r'</dfn>(\d{4,5}?)</span>', str(tr))
        if not len(price):
            continue
        sales_price_list = re.findall(r'促销优惠减(.*?)</span>', str(tr))
        sales_price = sales_price_list[0] if len(sales_price_list) else ''
        price_type = re.findall('room_type_name">(.*?)</span>', str(tr))[0]
        if 'em' in price_type:
            price_type = ','.join([
                *re.findall('(.*?)<em', price_type),
                *re.findall(r'\((.*?)\)', price_type)
            ])
        hotel_detail.append([room_name, price_type, price[0], sales_price])
    output_dir = '{}hotelDetail.txt'.format(data_dir)
    with open(output_dir, 'w') as f:
        f.write('\n'.join([','.join(ii) for ii in hotel_detail]))
    echo(1, 'Hotel: {}\nLoad {} price\nOutput path: {}\nSpend time: {:.2f}s'.format(
        hotel_id, len(hotel_detail), output_dir, end_time(version, 0)))
    return hotel_detail
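# --- illustration, not part of the original source ---
# How the price regex in parse_detail() behaves on a made-up row fragment:
# the lazy {4,5}? tries four digits first and only expands to five when the
# following '</span>' does not match.
import re

fragment = '<span class="base_price"><dfn>&yen;</dfn>1024</span>'  # hypothetical markup
assert re.findall(r'</dfn>(\d{4,5}?)</span>', fragment) == ['1024']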
def get_href(self):
    """
    get href from http://news.baidu.com/ns?word=%E6%AF%92%E7%8B%97%E8%82%89&tn=news&from=news&cl=2&rn=20&ct=1
    """
    version = begin_time()
    threadings = []
    for index in range(71):
        work = threading.Thread(
            target=self.href_once, args=(index,))
        threadings.append(work)
    for work in threadings:
        # time.sleep(.5)
        work.start()
    for work in threadings:
        work.join()
    href_map = [self.href_map[k] for k in sorted(self.href_map.keys())]
    self.href_map = sum(href_map, [])
    with codecs.open('bjh_href_poison.txt', 'w', encoding='utf-8') as f:
        f.write("\n".join(self.href_map))
    end_time(version)
def testdb(self, types):
    ''' test whether the proxies in the db are usable '''
    version = begin_time()
    if types == 2:
        typestr = '(0,1,2,3)'
    elif types == 1:
        typestr = '(1,3)'
    else:
        typestr = '(0,2)'
    results = self.Db.select_db(self.select_all % typestr)
    if results:
        for index in results:
            self.waitjudge.append(index[0])
        self.threadjude()
    self.initproxy()
    end_time(version)
def xiciproxy(self, page):
    """
    xici proxy
    http://www.xicidaili.com/nn/{page}
    The first proxy source I used, but it is mostly unusable now.
    """
    if not str(page).isdigit():
        echo(0, 'Please input a number!')
        return []
    version = begin_time()
    url = 'http://www.xicidaili.com/nn/%d'
    for index in range(1, page + 1):
        html = basic_req(url % index, 0)
        tem = html.find_all('tr')
        for row in tem[1:]:  # skip the header row
            tds = row.find_all('td')
            scheme = tds[5].text.lower()
            self.waitjudge.append(
                scheme + '://' + tds[1].text + ':' + tds[2].text)
    self.threadjude()
    end_time(version)
def build_md(self, load_img=False):
    """ build md """
    version = begin_time()
    threadings = []
    for index, tid in enumerate(self.request_list):
        work = threading.Thread(
            target=self.build_md_once, args=(index, tid,))
        threadings.append(work)
    for work in threadings:
        work.start()
    for work in threadings:
        work.join()
    if not load_img:
        return
    img_map = {k: self.img_map[k] for k in sorted(self.img_map.keys())}
    img_threadings = []
    for index in img_map.keys():
        for img_id, img_url in enumerate(img_map[index]):
            work = threading.Thread(
                target=self.load_img, args=(index, img_id, img_url,))
            img_threadings.append(work)
    for work in img_threadings:
        work.start()
    for work in img_threadings:
        work.join()
    end_time(version)