def girls_movie(scode, maxpage=1):
    um = UrlManager()
    um.update()
    source_urls = um.show()
    url_to_scray = source_urls[0].url + '/star/' + scode
    print('url is ' + url_to_scray)
    html = downloader.get_html(url_to_scray)
    soup = BeautifulSoup(html, "html.parser")
    results = []
    nextpageurl = 'first_loop'
    page_counter = 0
    while nextpageurl is not None:
        if nextpageurl != 'first_loop':
            # Fetch the next page before parsing; otherwise the loop
            # would re-parse the first page forever.
            html = downloader.get_html(nextpageurl)
            soup = BeautifulSoup(html, "html.parser")
        for url in pageparser.parser_homeurl(html):
            results.append(url)
        nextpageurl = pageparser.get_next_page_url(source_urls[0].url, html)
        page_counter += 1
        print('page => ' + str(page_counter))
        if page_counter == maxpage:
            break
    # Strip the site prefix and surrounding slashes to keep only the film codes.
    fcodes = []
    for url in results:
        print(url)
        code = url[len(source_urls[0].url):].strip('/')
        fcodes.append(code)
    return fcodes

def main(entrance):
    start_time = time.time()
    print('entrance:{}'.format(entrance))
    entrance_html = downloader.get_html(entrance)
    teams_url_list = pageparser.get_teams(entrance_html)
    for x in teams_url_list:
        print('spider to url:{}'.format(x))
        # The team id is the last path segment of the team URL.
        team_id = int(x.split('/')[-1])
        team_html = downloader.get_html(x)
        try:
            (team_name, country, create_year, league, coach, city, match_place,
             website, intro, img_url) = pageparser.get_team_info(team_html)
            (schedule, match_time, home_team, market, visit_team, score,
             half_score, result, market_trend, bet_type1, bet_type2,
             goal_number) = pageparser.get_team_history_match(team_id, team_html)
            controler.write_team_data(team_id, team_name, country, create_year,
                                      league, coach, city, match_place,
                                      website, intro, img_url)
            print('running time:{}s'.format(time.time() - start_time))
        except Exception as e:
            print(e)
            # Record the failure so the URL can be retried later.
            with open('fail_url.txt', 'a') as fd:
                fd.write('{}\n{}'.format(e, x))
            continue

def get_dict(url):
    """Fetch each detail page linked from `url` and yield its parsed dict."""
    url_html = downloader.get_html(url)
    for detail_url in pageparser.parser_homeurl(url_html):
        detail_page_html = downloader.get_html(detail_url)
        dict_jav = pageparser.parser_content(detail_page_html)
        # Overwrite the URL field with the detail page's address.
        dict_jav['URL'] = detail_url
        yield dict_jav, detail_url

def main(entrance):
    # Flag is 0 for uncensored entrances, 1 for censored ones.
    is_censored = 0 if 'uncensored' in entrance else 1
    page_crawler(entrance, is_censored)
    entrance_html = downloader.get_html(entrance)
    next_page_url = pageparser.get_next_page_url(entrance, entrance_html)
    while next_page_url:
        page_crawler(next_page_url, is_censored)
        next_page_html = downloader.get_html(next_page_url)
        next_page_url = pageparser.get_next_page_url(entrance, next_page_html)

def get_dict(url):
    """Fetch each detail page linked from `url` and yield its parsed dict."""
    url_html = downloader.get_html(url)
    for detail_url in pageparser.parser_homeurl(url_html):
        try:
            detail_page_html = downloader.get_html(detail_url)
            dict_jav = pageparser.parser_content(detail_page_html)
        except Exception:
            # Log the failed URL and move on to the next detail page.
            with open('fail_url.txt', 'a') as fd:
                fd.write('%s\n' % detail_url)
            print("Failed to crawl %s\ncrawling next detail page......" % detail_url)
            continue
        yield dict_jav, detail_url

def main(entrance):
    # Create the database table.
    controler.create_db()
    # 1 for uncensored, 0 for censored.
    is_uncensored = 1 if 'uncensored' in entrance else 0
    join_db(entrance, is_uncensored)
    entrance_html = downloader.get_html(entrance)
    next_page_url = pageparser.get_next_page_url(entrance, entrance_html)
    while next_page_url:
        join_db(next_page_url, is_uncensored)
        next_page_html = downloader.get_html(next_page_url)
        next_page_url = pageparser.get_next_page_url(entrance, next_page_html)

def parser_content(html):
    """Parse the content of a detail page and return a dict of its fields."""
    soup = BeautifulSoup(html, "html.parser")
    categories = {}

    code_name_doc = soup.find('span', text="識別碼:")
    code_name = code_name_doc.parent.contents[2].text if code_name_doc else ''
    categories['Video_ID'] = code_name

    date_issue_doc = soup.find('span', text="發行日期:")
    date_issue = date_issue_doc.parent.contents[1].strip() if date_issue_doc else ''
    categories['Release_Date'] = date_issue

    duration_doc = soup.find('span', text="長度:")
    duration = duration_doc.parent.contents[1].strip() if duration_doc else ''
    # Keep only the leading digits of the duration; guard against no match.
    duration_match = re.match(r"\d+", duration)
    categories['Length'] = duration_match[0] if duration_match else ''

    manufacturer_doc = soup.find('span', text="製作商:")
    manufacturer = manufacturer_doc.parent.contents[2].text if manufacturer_doc else ''
    categories['Producer'] = manufacturer

    series_doc = soup.find('span', text="系列:")
    series = series_doc.parent.contents[2].text if series_doc else ''
    categories['Series'] = series

    genre_doc = soup.find('p', text="類別:")
    genre = (i.text.strip()
             for i in genre_doc.find_next('p').select('span')) if genre_doc else ''
    genre_text = ''
    for tex in genre:
        genre_text += '%s ' % tex
    categories['Label'] = genre_text

    actors = soup.select('span[onmouseover^="hoverdiv"]')
    list_actor = parser_actor(actors)
    categories['Actors'] = list_actor

    # The canonical zh link carries the detail page's own URL.
    url = soup.select('link[hreflang="zh"]')[0]['href']
    categories['URL'] = url

    # The magnet-link page checks the Referer header.
    magnet_html = downloader.get_html(get_cili_url(soup), Referer_url=url)
    magnet = parser_magnet(magnet_html)
    categories['Magnet'] = magnet

    # Publisher and director are currently not collected:
    # publisher_doc = soup.find('span', text="發行商:")
    # publisher = publisher_doc.parent.contents[2].text if publisher_doc else ''
    # categories['發行商'] = publisher
    # director_doc = soup.find('span', text="導演:")
    # director = director_doc.parent.contents[2].text if director_doc else ''
    # categories['導演'] = director
    return categories

def get_sub_areas(city):
    url_prefix = "https://www.rew.ca/sitemap/real-estate/"
    city_url = url_prefix + city
    html = downloader.get_html(city_url)
    soup = BeautifulSoup(html, "html.parser")
    sub_areas = soup.select(".gridblock-link")
    if sub_areas:
        for sub_area in sub_areas:
            # The sub-area slug is the fourth path segment of the link.
            yield sub_area.get('href').split('/')[3]
    else:
        # A city with no sub-areas counts as its own single area.
        yield city

def get_list(province):
    url_prefix = "https://www.rew.ca/sitemap/real-estate/"
    province_url = url_prefix + province
    html = downloader.get_html(province_url)
    soup = BeautifulSoup(html, "html.parser")
    areas = soup.select(".gridblock-link")
    for area in areas:
        for i in get_sub_areas(area.get('href').split('/')[3]):
            print(i)
            with open(province + '_sub_area_list.txt', 'a') as fd:
                fd.write('%s\n' % i)

def subscribe_girl(scode):
    gs = session.query(Girl).filter(Girl.scode == scode).all()
    if len(gs) == 0:
        uu = UrlManager().show()[0].url + '/star/' + scode
        html = downloader.get_html(uu)
        name = pageparser.parser_girlurl(html)
        session.add(Girl(name=str(name), scode=scode, date=now()))
        print(str(name) + ' subscribed!')
        session.commit()
        session.close()
    else:
        print('This girl is already subscribed!')

def main(entrance):
    print('entrance:{}'.format(entrance))
    entrance_html = downloader.get_html(entrance)
    specialty_code_spe_dict, specialty_code_url_dict = pageparser.get_specialty(entrance_html)
    for code in specialty_code_url_dict:
        title = specialty_code_spe_dict[code]
        url = specialty_code_url_dict[code]
        print(title, code, url)
        spe_url = main_url + url
        spe_html = downloader.get_html(spe_url)
        detail = pageparser.get_specialtyDetail(spe_html)
        subject = detail['subject']
        class_ = detail['class']
        name = detail['name']
        intro = detail['intro']
        controler.write_data(title, code, subject, class_, name, intro)

def main(entrance_url):
    current_page = downloader.get_html(entrance_url)
    urls = htmlparser.get_urls(current_page)
    for url in urls:
        if not database.check_url_in_db(url):
            time.sleep(3)
            print("Working on " + url)
            html = downloader.get_html(url)
            property_dict = htmlparser.parse_content(html)
            if property_dict:
                property_dict['url'] = url
                database.write_data(property_dict)
            else:
                print("Nothing here " + url)
        # else:
        #     print("Already have it " + url)
    print("Done with " + entrance_url)
    # Recurse into the next results page, if there is one.
    next_page_url = htmlparser.get_next_page_urls(current_page)
    if next_page_url:
        main(next_page_url)

def run(self):
    while not self.task_q.full():
        self.task_q.put(0)
        # Take one URL from the manager under the shared lock.
        with lock:
            url = self.urlmgr.get_one()
        if url:
            con = downloader.get_html(url)
            self.res_q.put(self.parse_content_fn(con))
            with lock:
                # Queue any newly discovered URLs, then mark this one done.
                if self.parse_url_fn:
                    self.urlmgr.add_url(None, self.parse_url_fn(con))
                self.urlmgr.fin_url(url)
        self.task_q.get()
        self.task_q.task_done()

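# run() above assumes an enclosing thread class that wires up the queues, the
# URL manager, the parser callbacks, and a module-level lock. A minimal sketch
# of that wrapper under those assumptions (the class name, constructor
# signature, and defaults below are illustrative, not from the source):
import threading
import queue

lock = threading.Lock()  # assumed module-level lock shared by all workers

class SpiderWorker(threading.Thread):  # hypothetical wrapper class
    def __init__(self, urlmgr, parse_content_fn, parse_url_fn=None, max_tasks=8):
        super().__init__()
        self.urlmgr = urlmgr                    # shared URL manager
        self.parse_content_fn = parse_content_fn
        self.parse_url_fn = parse_url_fn        # optional link extractor
        self.task_q = queue.Queue(max_tasks)    # bounds in-flight work
        self.res_q = queue.Queue()              # collects parsed results
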
def parser_content(html):
    """Parse the content of a detail page and return a dict of its fields."""
    soup = BeautifulSoup(html, "html.parser")
    categories = {}

    code_name_doc = soup.find('span', text="識別碼:")
    code_name = code_name_doc.parent.contents[2].text if code_name_doc else ''
    categories['識別碼'] = code_name

    date_issue_doc = soup.find('span', text="發行日期:")
    date_issue = date_issue_doc.parent.contents[1].strip() if date_issue_doc else ''
    categories['發行日期'] = date_issue

    duration_doc = soup.find('span', text="長度:")
    duration = duration_doc.parent.contents[1].strip() if duration_doc else ''
    categories['長度'] = duration

    director_doc = soup.find('span', text="導演:")
    director = director_doc.parent.contents[2].text if director_doc else ''
    categories['導演'] = director

    manufacturer_doc = soup.find('span', text="製作商:")
    manufacturer = manufacturer_doc.parent.contents[2].text if manufacturer_doc else ''
    categories['製作商'] = manufacturer

    publisher_doc = soup.find('span', text="發行商:")
    publisher = publisher_doc.parent.contents[2].text if publisher_doc else ''
    categories['發行商'] = publisher

    series_doc = soup.find('span', text="系列:")
    series = series_doc.parent.contents[2].text if series_doc else ''
    categories['系列'] = series

    genre_doc = soup.find('p', text="類別:")
    genre = (i.text.strip()
             for i in genre_doc.find_next('p').select('span')) if genre_doc else ''
    genre_text = ''
    for tex in genre:
        genre_text += '%s ' % tex
    categories['類別'] = genre_text

    actor_doc = soup.select('span[onmouseover^="hoverdiv"]')
    actor = (i.text.strip() for i in actor_doc) if actor_doc else ''
    actor_text = ''
    for tex in actor:
        actor_text += '%s ' % tex
    categories['演員'] = actor_text

    # Add the page URL (from the canonical zh link) to the dict.
    url = soup.select('link[hreflang="zh"]')[0]['href']
    categories['URL'] = url

    # Add the magnet link to the dict; the magnet page checks the Referer header.
    magnet_html = downloader.get_html(_get_cili_url(soup), Referer_url=url)
    magnet = _parser_magnet(magnet_html)
    categories['磁力链接'] = magnet
    return categories