def dmmonecid(searchcid):
    """Fetch the detail page for a single cid and render it with the cid template.

    Every ``-`` in ``searchcid`` is replaced by ``00`` before the detail URL
    is built.  The page HTML is handed to ``ciddata``, which yields a
    ``(data, notitle)`` pair.

    Returns:
        ``(data, notitle)`` unchanged when ``data`` is the "page not found"
        message, otherwise ``(template_cid(data), notitle)``.
    """
    not_found_message = '指定されたページが見つかりません'

    normalized_cid = searchcid.replace('-', '00')
    detail_url = 'https://www.dmm.co.jp/digital/videoa/-/detail/=/cid={}/'.format(
        normalized_cid)
    parsed, notitle = ciddata(get_html_jp(detail_url))

    # Guard clause: pass the "not found" marker straight through, untemplated.
    if parsed == not_found_message:
        return parsed, notitle

    return template_cid(parsed), notitle
def dmmcid(in_q, out_q):
    """Drain a queue of listing-page URLs and collect the cids found on each page.

    Args:
        in_q: queue-like object providing ``get_nowait()`` and ``task_done()``
            (e.g. ``queue.Queue``) whose items are page URLs.
        out_q: object with an ``append`` method; for every URL processed, the
            list of cids found on that page is appended to it.

    The function returns ``None`` once the queue is empty.

    Items are taken with ``get_nowait()`` rather than an ``empty()`` check
    followed by a blocking ``get()``: with several workers sharing one queue,
    another worker can take the last item between the two calls and leave
    this one blocked forever.
    """
    import queue  # function-scope import: only queue.Empty is needed

    while True:
        try:
            url = in_q.get_nowait()
        except queue.Empty:
            break
        try:
            html = get_html_jp(url)
            cids = re.findall(
                r'https://www\.dmm\.co\.jp/digital/videoa/-/detail/=/cid=([_0-9a-z]+)/',
                html)
            out_q.append(cids)
        finally:
            # Always mark the item as done so that in_q.join() cannot hang
            # when fetching or parsing a page raises.
            in_q.task_done()
def findinfo(articleid):
    """Look up the page count and title of an actress listing.

    Fetches the listing page for ``articleid`` and scans its HTML for
    pagination links and for the ``<title>`` text that precedes the fixed
    site suffix.

    Returns:
        ``(last_page, title)`` where ``last_page`` is the largest page number
        found in the pagination links (``1`` when there are none) and
        ``title`` is the first matching title string.

    Raises:
        IndexError: if no matching ``<title>`` is present in the page.
    """
    listing_url = "https://www.dmm.co.jp/digital/videoa/-/list/=/article=actress/id=%s/" % articleid
    html = get_html_jp(listing_url)

    page_numbers = re.findall(
        r'/digital/videoa/-/list/=/article=actress/id=\d+/page=(\d+)/', html)
    titles = re.findall(
        r'<title>(.*) - エロ動画・アダルトビデオ - FANZA動画</title>', html)

    # No pagination links means the listing fits on a single page.
    last_page = max(int(number) for number in page_numbers) if page_numbers else 1

    return (last_page, titles[0])
def prephotos(searchcid):
    """Collect the preview image URLs from the detail page of ``searchcid``.

    The detail page is fetched and parsed, every ``<img class="mg-b6">`` is
    gathered, and the ``pics.dmm.co.jp`` JPEG URLs are pulled out of their
    markup.  Each URL is then rewritten: ``jp`` is inserted six characters
    before its end, and the first ``-jp`` (if any) is turned into ``jp-``.
    NOTE(review): this presumably maps thumbnail URLs to the full-size
    variants; confirm against the site's URL scheme.

    Returns:
        A list of rewritten URL strings (empty when no image matches).
    """
    detail_url = 'https://www.dmm.co.jp/digital/videoa/-/detail/=/cid={}/'.format(
        searchcid)
    soup = BeautifulSoup(get_html_jp(detail_url), 'lxml')
    thumbnails = soup.find_all('img', attrs={'class': 'mg-b6'})
    thumbnail_urls = re.findall(
        r'(https://pics.dmm.co.jp/digital/video/.*?/.*?.jpg)', str(thumbnails))

    # Slicing at -6 is equivalent to inserting 'jp' before the last six
    # characters of the URL.
    return [
        (address[:-6] + 'jp' + address[-6:]).replace('-jp', 'jp-', 1)
        for address in thumbnail_urls
    ]
def _parse_search_box(box):
    """Extract the fields of one search result from its HTML fragment.

    Returns ``None`` when the fragment contains no ``<span class="txt">``
    element, which is how non-result fragments (including empty ones) are
    recognised.  Otherwise returns a dict with the keys ``cid``, ``keyword``,
    ``links``, ``img``, ``title``, ``sublinks`` and ``subtexts``.  A field
    whose pattern does not match is set to ``'-'``; ``keyword`` is the
    comma-joined list of icon labels and is ``''`` when there are none.
    """
    if re.search(r'<span class=\"txt\">(.*?)</span>', box) is None:
        return None

    def first(pattern):
        # First captured group of the first match, or the '-' placeholder.
        found = re.search(pattern, box)
        return found.group(1) if found else '-'

    keywords = re.findall(
        r'<span class=\"ico-\w+-\w+\"><span>(.*?)</span></span>', box)

    return {
        'cid': first(
            r'<a href=\"https://www\.dmm\.co\.jp/.*?/cid=(\w+)/\?.*?\">'),
        'keyword': ','.join(keywords),
        'links': first(
            r'<a href=\"(https://www\.dmm\.co\.jp/.*?-/detail/=/cid=\w+/\?.*?)\">'),
        'img': first(r'(pics\.dmm\.co\.jp/.*?/\w+/\w+.jpg)'),
        'title': first(r'alt=\"(.*)\" src'),
        'sublinks': first(r'<span><a href=\"(.*?)\">.*?</a></span>'),
        'subtexts': first(r'<span><a href=\".*?\">(.*?)</a></span>'),
    }


def dmmsearchall_data(searchstr):
    """Run a site-wide search and return the parsed result entries.

    Args:
        searchstr: search text, inserted verbatim into the search URL.

    Returns:
        ``(noresult_message, 1)`` when the page contains the "no matching
        products" message.  Otherwise ``(boxlist, stitle)``: ``boxlist`` is a
        list of dicts (one per result, see ``_parse_search_box``) and
        ``stitle`` is the page ``<title>`` text, or ``'検索結果'`` when the
        title cannot be extracted.

    Raises:
        AttributeError: if the page has no ``<div class="d-area">`` container
            (``soup.find`` then yields ``None``).
    """
    noresult = 'に一致する商品は見つかりませんでした。'

    url = 'https://www.dmm.co.jp/search/=/searchstr={}/sort=rankprofile/'.format(
        searchstr)
    html = get_html_jp(url)

    # Check whether the search produced any results at all.
    if noresult in html:
        return (noresult, 1)

    soup = BeautifulSoup(html, 'lxml')
    searchbody = soup.find('div', attrs={'class': 'd-area'})

    try:
        stitle = re.findall(r'<title>(.*?)</title>', html)[0]
    except (IndexError, TypeError):
        # No <title> element (or html is not text): use the generic heading.
        stitle = '検索結果'

    boxall = searchbody.find('div', attrs={'class': 'd-sect'})

    # The result section is split textually on '<div>'; fragments that are
    # not result entries are rejected by the helper.
    boxlist = []
    for box in str(boxall).split('<div>'):
        boxdict = _parse_search_box(box)
        if boxdict is not None:
            boxlist.append(boxdict)
    return (boxlist, stitle)
def _parse_links_box(box):
    """Extract the fields of one listing entry from its HTML fragment.

    Returns ``None`` when the fragment contains no ``<span class="txt">``
    element, which is how non-entry fragments (including empty ones) are
    recognised.  Otherwise returns a dict with the keys ``cid``, ``keyword``,
    ``links``, ``img``, ``title``, ``sublinks`` and ``subtexts``.  A field
    whose pattern does not match is set to ``'-'``; ``keyword`` is the
    comma-joined list of icon labels and is ``''`` when there are none.
    ``sublinks`` is prefixed with the site origin when present.
    """
    if re.search(r'<span class=\"txt\">(.*?)</span>', box) is None:
        return None

    def first(pattern):
        # First captured group of the first match, or the '-' placeholder.
        found = re.search(pattern, box)
        return found.group(1) if found else '-'

    keywords = re.findall(
        r'<span class=\"ico-\w+-\w+\"><span>(.*?)</span></span>', box)
    sublink = re.search(r'span><a href=\"(.*?)\">.*?</a></span>', box)

    return {
        'cid': first(r'https://www\.dmm\.co\.jp/.*?/cid=(\w+)/'),
        'keyword': ','.join(keywords),
        'links': first(r'(https://www\.dmm\.co\.jp/.*?/cid=\w+/)'),
        'img': first(r'(pics\.dmm\.co\.jp/.*?/\w+/\w+.jpg)'),
        'title': first(r'alt=\"(.*)\" src'),
        'sublinks': (
            ('https://www.dmm.co.jp' + sublink.group(1)) if sublink else '-'),
        'subtexts': first(r'<span><a href=\".*?\">(.*?)</a></span>'),
    }


def dmmlinks_data(links):
    """Fetch a listing page by URL and return its parsed entries.

    Args:
        links: URL of the listing page to fetch.

    Returns:
        ``(boxlist, stitle)``: ``boxlist`` is a list of dicts (one per entry,
        see ``_parse_links_box``) and ``stitle`` is the page ``<title>`` text,
        or ``'検索結果'`` when the title cannot be extracted.

    Raises:
        AttributeError: if the page has no ``<div class="d-area">`` container
            (``soup.find`` then yields ``None``).
    """
    html = get_html_jp(links)

    soup = BeautifulSoup(html, 'lxml')
    searchbody = soup.find('div', attrs={'class': 'd-area'})

    try:
        stitle = re.findall(r'<title>(.*?)</title>', html)[0]
    except (IndexError, TypeError):
        # No <title> element (or html is not text): use the generic heading.
        stitle = '検索結果'

    boxall = searchbody.find_all('li', attrs={'style': 'width: 130px;'})

    # The stringified item list is split textually on the closing tags of
    # each entry; fragments that are not entries are rejected by the helper.
    boxlist = []
    for box in str(boxall).split('</div></li>'):
        boxdict = _parse_links_box(box)
        if boxdict is not None:
            boxlist.append(boxdict)
    return (boxlist, stitle)