def get_brief(mcs, code):
    """Search avsox.host for ``code`` and build a ``Brief`` from the detail page.

    Args:
        mcs: Owner of the ``__url_pattern`` and ``__release_date_pattern``
            patterns that are passed to ``re.search``.
        code: Movie code; appended verbatim to the search URL.

    Returns:
        A populated ``Brief``, or ``None`` when the search page contains no
        link matching ``__url_pattern``.

    Raises:
        IndexError: If the detail page has no ``.movie``, ``.screencap`` or
            ``#avatar-waterfall`` element.
    """
    search_url = "https://avsox.host/cn/search/" + code
    rsp = requests.get(search_url, proxies=proxy)
    match = re.search(mcs.__url_pattern, rsp.text)
    if not match:
        return None

    # Group 1 of the pattern is used as the URL of the detail page.
    rsp = requests.get(match.group(1), proxies=proxy)
    bs = bs4.BeautifulSoup(rsp.text, "lxml")
    movie = bs.select(".movie")[0]

    brief = Brief()
    brief.code = code
    img = movie.select(".screencap", limit=1)[0].a.img
    brief.title = img.attrs["title"]
    # The release date is optional; noexcept is given "" as its second
    # argument (presumably the fallback when the lambda raises -- confirm).
    brief.release_date = noexcept(
        lambda: re.search(mcs.__release_date_pattern, str(movie)).group(1),
        "")
    brief.actress = ", ".join(
        x.text
        for x in bs.select("#avatar-waterfall", limit=1)[0].find_all("span"))

    # Probe the cover URL. Only 3xx codes are redirects: the upper bound is
    # exclusive so that a 400 (client error) is not treated as one.
    # NOTE(review): requests follows redirects by default, so a 3xx status
    # is rarely observed here -- confirm whether allow_redirects=False was
    # intended.
    rsp = requests.get(img.attrs["src"], proxies=proxy)
    if 300 <= rsp.status_code < 400:
        if "location" in rsp.headers:
            brief.preview_img_url = rsp.headers["location"]
    elif rsp.status_code == 200:
        brief.preview_img_url = img.attrs["src"]
    # For any other status preview_img_url is not assigned here.
    return brief
def get_brief_from_a_card(card_tag):
    """Build a ``Brief`` from a single result-card element.

    Args:
        card_tag: Parsed card element; must contain an ``<h4>`` holding the
            code. The date, image, title and actress links are optional.

    Returns:
        A ``Brief`` with ``preview_img_url``, ``title``, ``actress``,
        ``release_date`` and ``code`` assigned.

    Raises:
        AttributeError: If the card has no ``<h4>`` element.
    """
    # First YYYY-MM-DD string anywhere in the card text, parsed to a
    # datetime. try_evaluate returns a 2-tuple; only the first item is used.
    release_date, _ = try_evaluate(lambda: datetime.datetime.strptime(
        re.search(r"\d\d\d\d-\d\d-\d\d", card_tag.text).group(0),
        "%Y-%m-%d"))
    actress = [
        a.text
        for a in card_tag.find_all(name="a", attrs={"class": "btn-danger"})
    ]
    img, _ = try_evaluate(
        lambda: card_tag.find(name="img").attrs["data-src"])
    # The image lookup is best-effort, so img may not be a string when it
    # failed (presumably None -- confirm against try_evaluate). Only prefix
    # a scheme when there is a URL to fix.
    if isinstance(img, str) and not img.startswith("http"):
        img = "http:" + img

    brief = Brief()
    brief.preview_img_url = img
    brief.title, _ = try_evaluate(
        lambda: card_tag.find(name="h5").text.strip(), "")
    brief.actress = ", ".join(actress)
    brief.release_date = release_date
    brief.code = card_tag.find(name="h4").text.strip()
    return brief
def get_brief(mcs, code):
    """Look up ``code`` on javlibrary.com and build a ``Brief``.

    Args:
        mcs: Owner of the ``__client`` object used for every HTTP GET.
        code: Movie code; appended verbatim to the search URL.

    Returns:
        A populated ``Brief``, or ``None`` when no ``og:url`` meta tag is
        found on the search response or on the first listed result.

    Raises:
        IndexError: If the fallback listing has no ``.video`` entry, or the
            detail page lacks one of the expected elements.
    """
    og_url_pattern = r"\"og:url\" content=\"//(.+?)\">"

    search_html = mcs.__client.get(
        "http://www.javlibrary.com/ja/vl_searchbyid.php?keyword=" + code,
        proxies=proxy).text
    found = re.search(og_url_pattern, search_html)
    if found is None:
        # The response has no og:url (e.g. JUFE-114): follow the first
        # ``.video`` entry instead.
        listing = bs4.BeautifulSoup(search_html, "lxml")
        href = listing.select(".video")[0].a.attrs['href']
        # The first character of the href is dropped before it is appended.
        first_hit = mcs.__client.get(
            "http://www.javlibrary.com/ja" + href[1:], proxies=proxy)
        found = re.search(og_url_pattern, first_hit.text)
        if found is None:
            return None

    detail_html = mcs.__client.get(
        "http://" + found.group(1), proxies=proxy).text

    brief = Brief()
    soup = bs4.BeautifulSoup(detail_html, "lxml")
    brief.title = soup.select(".post-title")[0].text

    jacket = soup.select("#video_jacket_img")[0].attrs["src"]
    if jacket.startswith("http"):
        brief.preview_img_url = jacket
    else:
        brief.preview_img_url = "http:" + jacket

    brief.code = code
    # The release date is the text of the last cell in #video_date.
    brief.release_date = soup.select("#video_date")[0].select("td")[-1].text
    # Cast names come from the .star elements of #video_cast (e.g. AQSH-035).
    cast = soup.select("#video_cast")[0].select(".star")
    brief.actress = ", ".join([star.text for star in cast])
    return brief
def get_brief(cls, code):
    """Search avsox.net for ``code`` and build a ``Brief`` from the detail page.

    Args:
        cls: Owner of the ``__url_pattern`` and ``__release_date_pattern``
            patterns that are passed to ``re.search``.
        code: Movie code; appended verbatim to the search URL.

    Returns:
        A populated ``Brief``, or ``None`` when the search page contains no
        link matching ``__url_pattern``.

    Raises:
        IndexError: If the detail page has no ``.movie``, ``.screencap`` or
            ``#avatar-waterfall`` element.
    """
    search_url = "https://avsox.net/cn/search/" + code
    rsp = requests.get(search_url)
    match = re.search(cls.__url_pattern, rsp.text)
    if not match:
        # Nothing found: report "no result" instead of failing with an
        # AttributeError on ``None.group``.
        return None

    rsp = requests.get(match.group(1))
    bs = bs4.BeautifulSoup(rsp.text, "lxml")
    movie = bs.select(".movie")[0]

    brief = Brief()
    brief.code = code
    img = movie.select(".screencap", limit=1)[0].a.img
    brief.title = img.attrs['title']
    # The release date is optional; the first item of try_evaluate's result
    # is used ("" is passed as its second argument, presumably the
    # fallback -- confirm).
    brief.set_release_date(
        try_evaluate(
            lambda: re.search(cls.__release_date_pattern,
                              str(movie)).group(1),
            "")[0])
    brief.actress = ", ".join(
        x.text
        for x in bs.select("#avatar-waterfall", limit=1)[0].find_all('span'))

    # Probe the cover URL. Only 3xx codes are redirects: the upper bound is
    # exclusive so that a 400 (client error) is not treated as one.
    # NOTE(review): requests follows redirects by default, so a 3xx status
    # is rarely observed here -- confirm whether allow_redirects=False was
    # intended.
    rsp = requests.get(img.attrs['src'])
    if 300 <= rsp.status_code < 400:
        if "location" in rsp.headers:
            brief.preview_img_url = rsp.headers['location']
    elif rsp.status_code == 200:
        brief.preview_img_url = img.attrs['src']
    # For any other status preview_img_url is not assigned here.
    return brief
def __get_brief_from_card(card):
    """Convert one result card into a ``Brief``.

    Args:
        card: Parsed card element containing the code tag, actress tags,
            a ``.title`` element and a ``footer`` whose ``<p>`` holds the
            release date.

    Returns:
        A ``Brief`` with title, preview image, code, actress and release
        date assigned.

    Raises:
        IndexError: If the code tag, ``.title`` or ``footer`` is missing.
    """
    movie_code = card.select(".tag.is-link.is-light")[0].text.strip()
    cast_names = [
        tag.text.strip() for tag in card.select(".tag.is-primary.is-light")
    ]
    heading = card.select(".title")[0]
    heading_text = heading.text.strip()
    # The preview URL is the first ``rel`` value of the title link; the
    # lookup is wrapped in noexcept because it is optional.
    preview = noexcept(lambda: heading.a.attrs["rel"][0])
    released = card.select("footer")[0].p.text.strip()

    brief = Brief()
    brief.title = heading_text
    brief.preview_img_url = preview
    brief.code = movie_code
    brief.actress = ", ".join(cast_names)
    brief.release_date = released
    return brief
def __get_brief_by_card(card):
    """Convert one column-based result card into a ``Brief``.

    The card's ``.column`` elements are read by position: index 1 is the
    release date, 2 the actress spans, 3 the title (and preview link) and
    4 the code.

    Args:
        card: Parsed card element with at least five ``.column`` children.

    Returns:
        A ``Brief`` with title, preview image, code, actress and release
        date set.

    Raises:
        IndexError: If the card has fewer than five ``.column`` elements.
    """
    cols = card.select(".column")
    # The code is the element immediately following the fifth column tag.
    movie_code = cols[4].next.strip()
    cast_names = [
        span.text.strip() for span in cols[2].find_all(name="span")
    ]
    title_text = cols[3].text.strip()
    # Optional preview URL: first ``rel`` value of the title link. Only the
    # first item of try_evaluate's 2-tuple is used.
    preview, _ = try_evaluate(lambda: cols[3].a.attrs["rel"][0])
    released = cols[1].text.strip()

    brief = Brief()
    brief.title = title_text
    brief.preview_img_url = preview
    brief.code = movie_code
    brief.actress = ", ".join(cast_names)
    brief.set_release_date(released)
    return brief
def get_brief(cls, code):
    """Look up ``code`` on javlibrary.com and build a ``Brief``.

    Args:
        cls: Owner of the ``__client`` object used for the HTTP GETs.
        code: Movie code; appended verbatim to the search URL.

    Returns:
        A populated ``Brief``, or ``None`` when the search response has no
        ``og:url`` meta tag.

    Raises:
        IndexError: If the detail page lacks ``.post-title``,
            ``#video_jacket_img`` or ``#video_date``.
    """
    search_html = cls.__client.get(
        "http://www.javlibrary.com/ja/vl_searchbyid.php?keyword=" + code
    ).text
    found = re.search(r"\"og:url\" content=\"//(.+?)\">", search_html)
    if found is None:
        return None

    # The captured og:url is scheme-less, so "http://" is prepended.
    detail_html = cls.__client.get("http://" + found.group(1)).text

    brief = Brief()
    soup = bs4.BeautifulSoup(detail_html, "lxml")
    brief.title = soup.select(".post-title")[0].text

    jacket = soup.select("#video_jacket_img")[0].attrs['src']
    if jacket.startswith("http"):
        brief.preview_img_url = jacket
    else:
        brief.preview_img_url = "http:" + jacket

    brief.code = code
    # The release date is the text of the last cell in #video_date.
    brief.set_release_date(
        soup.select("#video_date")[0].select("td")[-1].text)
    brief.actress = ", ".join([cast.text for cast in soup.select(".cast")])
    return brief
def __get_brief_by_card(card):
    """Convert one column-based result card into a ``Brief``.

    The card's ``.column`` elements are read by position: index 1 is the
    release date, 2 the actress spans, 3 the title (and preview link) and
    4 the code.

    Args:
        card: Parsed card element.

    Returns:
        A ``Brief`` with title, preview image, code, actress and release
        date assigned, or ``None`` when the card has no ``.column``
        elements.

    Raises:
        IndexError: If the card has between one and four ``.column``
            elements.
    """
    cols = card.select(".column")
    if len(cols) == 0:
        # Some cards carry no columns at all, e.g. the one returned for
        # Kanako Iioka (飯岡かなこ).
        return None

    # The code is the element immediately following the fifth column tag.
    movie_code = cols[4].next.strip()
    cast_names = [
        span.text.strip() for span in cols[2].find_all(name="span")
    ]
    title_text = cols[3].text.strip()
    # Optional preview URL: first ``rel`` value of the title link.
    preview = noexcept(lambda: cols[3].a.attrs["rel"][0])
    released = cols[1].text.strip()

    brief = Brief()
    brief.title = title_text
    brief.preview_img_url = preview
    brief.code = movie_code
    brief.actress = ", ".join(cast_names)
    brief.release_date = released
    return brief
def get_brief(mcs, code):
    """Search warashi-asian-pornstars.fr for ``code`` and build a ``Brief``.

    Args:
        mcs: Unused by this function.
        code: Movie code submitted as the ``recherche_valeur`` form field.

    Returns:
        A populated ``Brief``, or ``None`` when the search yields no
        ``.resultat-film`` entry.

    Raises:
        IndexError: If the detail page lacks ``#fiche-film-infos``, a
            ``<video>`` element or ``#casting-f``.
        ValueError: If the "release date" value does not match the
            ``"%B %d, %Y"`` format.
    """
    url = "http://warashi-asian-pornstars.fr/en/s-12/search"
    # Pass the form as a mapping so requests URL-encodes the code instead of
    # it being spliced raw into the request body.
    payload = {"recherche_critere": "v", "recherche_valeur": code}
    headers = {'content-type': "application/x-www-form-urlencoded"}
    response = requests.post(url,
                             data=payload,
                             headers=headers,
                             proxies=proxy)
    bs = bs4.BeautifulSoup(response.text, "lxml")
    results = bs.select(".resultat-film")
    if not results:
        # No hit for this code: report "not found" instead of IndexError.
        return None

    url = "http://warashi-asian-pornstars.fr" + results[0].a.attrs['href']
    # Use the same proxy as the search request.
    response = requests.get(url, proxies=proxy)
    bs = bs4.BeautifulSoup(response.text, "lxml")
    div = bs.select("#fiche-film-infos")[0]
    ps = div.find_all(name='p')

    brief = Brief()
    brief.preview_img_url = "http://warashi-asian-pornstars.fr" + bs.select(
        'video')[0].attrs["poster"]
    brief.code = code
    for p in ps:
        # Only "key:value" paragraphs with exactly one colon are considered.
        tokens = p.text.split(":")
        if len(tokens) != 2:
            continue
        k, v = tokens
        if k == "original title":
            brief.title = v.strip()
        elif k == "release date":
            brief.release_date = datetime.datetime.strptime(
                v.strip(), "%B %d, %Y")

    div = bs.select("#casting-f")[0]
    brief.actress = ",".join(p.text for p in div.select(".ja"))
    return brief
def get_brief_from_a_card(card_tag):
    """Build a ``Brief`` from a single result-card element.

    Args:
        card_tag: Parsed card element; must contain an ``<h4>`` holding the
            code. The date, image, title and actress links are optional.

    Returns:
        A ``Brief`` with ``preview_img_url``, ``title``, ``actress``, the
        release date and ``code`` populated.

    Raises:
        AttributeError: If the card has no ``<h4>`` element.
    """
    # First YYYY-MM-DD string anywhere in the card text, parsed to a
    # datetime. try_evaluate returns a 2-tuple; only the first item is used.
    release_date, _ = try_evaluate(lambda: datetime.datetime.strptime(
        re.search(r"\d\d\d\d-\d\d-\d\d", card_tag.text).group(0),
        "%Y-%m-%d"))
    actress = [
        a.text
        for a in card_tag.find_all(name='a', attrs={'class': 'btn-danger'})
    ]
    img, _ = try_evaluate(lambda: card_tag.find(name='img').attrs['src'])
    # The image lookup is best-effort, so img may not be a string when it
    # failed (presumably None -- confirm against try_evaluate). Prefix a
    # scheme only when the URL has none: checking for "http:" alone would
    # turn "https://..." into "http:https://...".
    if isinstance(img, str) and not img.startswith(("http:", "https:")):
        img = "http:" + img

    brief = Brief()
    brief.preview_img_url = img
    brief.title, _ = try_evaluate(
        lambda: card_tag.find(name='h5').text.strip(), "")
    brief.actress = ", ".join(actress)
    brief.set_release_date(release_date)
    brief.code = card_tag.find(name='h4').text.strip()
    return brief
def __get_brief_by_box(box):
    """Convert one result box into a ``Brief``.

    Args:
        box: Parsed box element containing a ``span.video_id``, a
            ``div.col-sm-7`` (title, actresses, preview button) and a
            ``div.col-sm-2`` whose ``<span>`` holds the release date.

    Returns:
        A ``Brief`` with title, preview image, code, actress and release
        date populated.

    Raises:
        AttributeError: If one of the required elements is missing.
    """
    raw_code = box.find(name='span', attrs={'class': 'video_id'}).text
    details = box.find(name='div', attrs={'class': 'col-sm-7'})
    cast_cells = details.find_all(name='div', attrs={'class': 'col-xs-6'})
    cast = ", ".join(cell.text for cell in cast_cells)
    raw_title = details.find(name='span', attrs={'class': 'video_title'}).text
    # Optional preview: the ``rel`` attribute of the preview button. Only
    # the first item of try_evaluate's 2-tuple is used.
    preview, _ = try_evaluate(
        lambda: details.find(name='span',
                             attrs={'class': 'preview_btn'}).attrs['rel'])
    released = box.find(name='div', attrs={'class': 'col-sm-2'}).span.text

    brief = Brief()
    brief.title = raw_title.strip()
    brief.preview_img_url = preview
    brief.code = raw_code.strip()
    brief.actress = cast.strip()
    brief.set_release_date(released)
    return brief