def dmmcid(in_q, out_q):
    """Drain URLs from ``in_q`` and collect the content ids found on each page.

    For every URL taken from ``in_q`` the page is fetched with
    ``get_html_jp`` and all ``cid`` values appearing in DMM detail links are
    appended to ``out_q`` as one list per page.

    Args:
        in_q: Queue-like object providing ``get_nowait()`` and
            ``task_done()`` (e.g. ``queue.Queue``) that holds page URLs.
        out_q: Object with an ``append`` method (e.g. a ``list``) that
            receives one list of cid strings per processed URL.

    Returns:
        None. The function returns once ``in_q`` has no more items.
    """
    # Local import so this block does not depend on the file's import header.
    from queue import Empty

    while True:
        # get_nowait() instead of "empty() then get()": with several workers
        # another one may take the last item between the check and the
        # blocking get(), which would hang this worker forever.
        try:
            url = in_q.get_nowait()
        except Empty:
            return
        try:
            html = get_html_jp(url)
            cids = re.findall(
                r'https://www\.dmm\.co\.jp/digital/videoa/-/detail/=/cid=([_0-9a-z]+)/',
                html)
            out_q.append(cids)
        finally:
            # Always balance the get(), even when the fetch fails, so that
            # in_q.join() cannot block on an item nobody will finish.
            in_q.task_done()
def dmmonecid(searchcid):
    """Fetch the DMM detail page for one cid and render it.

    Every ``-`` in ``searchcid`` is replaced by ``00`` before the detail URL
    is built. The fetched HTML is handed to ``ciddata``; when that reports
    the "page not found" message the message is returned unchanged,
    otherwise the data is passed through ``template_cid``.

    Args:
        searchcid: Product id string, possibly containing ``-``.

    Returns:
        A 2-tuple ``(result, notitle)`` where ``result`` is either the
        not-found message or the output of ``template_cid`` and ``notitle``
        is the second value returned by ``ciddata``.
    """
    detail_url = 'https://www.dmm.co.jp/digital/videoa/-/detail/=/cid={}/'.format(
        searchcid.replace('-', '00'))
    page = get_html_jp(detail_url)
    parsed, notitle = ciddata(page)
    if parsed != '指定されたページが見つかりません':
        # Normal case: render the extracted data.
        parsed = template_cid(parsed)
    return parsed, notitle
def findinfo(articleid):
    """Return the number of list pages and the page title for an actress id.

    The actress listing page is fetched with ``get_html_jp``; the highest
    ``page=N`` number found in its pagination links is taken as the page
    count (1 when there are no such links).

    Args:
        articleid: Actress id interpolated into the listing URL.

    Returns:
        A 2-tuple ``(page_count, title)``; ``page_count`` is an ``int`` and
        ``title`` is the text captured from the ``<title>`` element in front
        of the fixed FANZA suffix.

    Raises:
        IndexError: If the page has no ``<title>`` matching the expected
            pattern (same as before; callers may rely on it).
    """
    url = "https://www.dmm.co.jp/digital/videoa/-/list/=/article=actress/id=%s/" % articleid
    html = get_html_jp(url)
    page_numbers = re.findall(
        r'/digital/videoa/-/list/=/article=actress/id=\d+/page=(\d+)/', html)
    titles = re.findall(
        r'<title>(.*) - エロ動画・アダルトビデオ - FANZA動画</title>', html)
    # The previous de-dup step compared str items against a list of ints and
    # therefore never filtered anything; max() does not need it anyway.
    page_count = max(int(number) for number in page_numbers) if page_numbers else 1
    return (page_count, titles[0])
def _prephotos_fullsize_url(url):
    """Insert ``jp`` in front of the trailing ``-<index>.jpg`` of ``url``."""
    converted, replaced = re.subn(r'(-\d+\.jpg)$', r'jp\1', url)
    if replaced:
        return converted
    # Unexpected URL shape: fall back to the historical fixed-offset
    # insertion so such URLs are transformed exactly as before.
    return url[:-6] + 'jp' + url[-6:]


def prephotos(searchcid):
    """Collect the sample-photo URLs shown on a DMM detail page.

    The detail page for ``searchcid`` is fetched with ``get_html_jp``, the
    ``<img class="mg-b6">`` elements are scanned for ``pics.dmm.co.jp``
    image URLs, and ``jp`` is inserted before the ``-<index>.jpg`` suffix of
    each URL (presumably turning the thumbnail URL into the full-size one --
    confirm against the site).

    The previous implementation inserted ``jp`` six characters from the end,
    which is only correct for one-digit indexes: ``...-10.jpg`` became
    ``...-jp10.jpg``. The suffix is now located by pattern instead.

    Args:
        searchcid: Product id used verbatim in the detail URL.

    Returns:
        List of converted image URL strings, in page order.
    """
    searchurl = 'https://www.dmm.co.jp/digital/videoa/-/detail/=/cid={}/'.format(
        searchcid)
    html = get_html_jp(searchurl)
    soup = BeautifulSoup(html, 'lxml')
    thumbnails = soup.find_all('img', attrs={'class': 'mg-b6'})
    photourls = re.findall(
        r'(https://pics.dmm.co.jp/digital/video/.*?/.*?.jpg)', str(thumbnails))
    return [_prephotos_fullsize_url(photourl) for photourl in photourls]
def dmmlinks_data(links):
    """Scrape the product boxes from a DMM list page.

    The page at ``links`` is fetched with ``get_html_jp``. Every
    ``<li style="width: 130px;">`` inside the ``d-area`` div is turned into
    a dict; fragments without a ``<span class="txt">`` title are skipped.

    Args:
        links: URL of the list page to scrape.

    Returns:
        A 2-tuple ``(boxlist, stitle)``. ``boxlist`` is a list of dicts with
        the keys ``cid``, ``keyword``, ``links``, ``img``, ``title``,
        ``sublinks`` and ``subtexts``; a field that cannot be extracted is
        ``'-'`` (``keyword`` is ``''`` when no keyword spans exist, and
        ``subtexts`` is whatever the sublink anchor's ``.string`` yields).
        ``stitle`` is the page ``<title>`` text or ``'検索結果'`` when no
        title is found. ``boxlist`` is empty when the page has no ``d-area``
        div.
    """
    def first_match(pattern, text):
        # First regex capture, or the '-' placeholder when nothing matches.
        found = re.findall(pattern, text)
        return found[0] if found else '-'

    html = get_html_jp(links)
    soup = BeautifulSoup(html, 'lxml')
    searchbody = soup.find('div', attrs={'class': 'd-area'})
    try:
        stitle = re.findall(r'<title>(.*?)</title>', html)[0]
    except (IndexError, TypeError):
        stitle = '検索結果'

    # Check whether the page has a result area at all; previously a missing
    # area crashed with AttributeError on None.
    if searchbody is None:
        return ([], stitle)

    boxall = searchbody.find_all('li', attrs={'style': 'width: 130px;'})
    boxlist = []
    for box in str(boxall).split('</div></li>'):
        # Fragments without a title span (e.g. the trailing "]" left over by
        # the split) are not product boxes.
        if not re.findall(r'<span class=\"txt\">(.*?)</span>', box):
            continue

        # Resolve the sublink anchor fresh for every box so a value from a
        # previous iteration can never leak into this one.
        try:
            slink = BeautifulSoup(box, 'lxml').find(
                'p', attrs={'class': 'sublink'})
        except Exception:
            slink = None
        anchor = slink.find('a') if slink is not None else None
        href = anchor.get('href') if anchor is not None else None

        boxdict = {}
        boxdict['cid'] = first_match(
            r'<a href=\"https://www\.dmm\.co\.jp/digital/videoa/-/detail/=/cid=(\w+)/\?.*?\">',
            box)
        boxdict['keyword'] = ','.join(re.findall(
            r'<span class=\"ico-st-\w+\"><span>(.*?)</span></span>', box))
        boxdict['links'] = first_match(
            r'<a href=\"(https://www\.dmm\.co\.jp/digital/videoa/-/detail/=/cid=\w+/\?.*?)\">',
            box)
        boxdict['img'] = first_match(
            r'<span class=\"img\"><img alt=\".*?\" src=\"(https://pics.dmm.co.jp/digital/video/\w+/\w+.jpg)\"/></span>',
            box)
        boxdict['title'] = first_match(
            r'<span class=\"img\"><img alt=\"(.*?)\" src=\"https://pics.dmm.co.jp/digital/video/\w+/\w+.jpg\"/></span>',
            box)
        if isinstance(href, str):
            boxdict['sublinks'] = 'https://www.dmm.co.jp' + href
        else:
            boxdict['sublinks'] = '-'
        boxdict['subtexts'] = anchor.string if anchor is not None else '-'
        boxlist.append(boxdict)
    return (boxlist, stitle)
def dmmsearch_data(searchstr):
    """Run a DMM video search and scrape the product boxes from the result.

    The search page for ``searchstr`` (30 items per page) is fetched with
    ``get_html_jp``. Every ``<li style="width: 130px;">`` inside the
    ``d-area`` div is turned into a dict; fragments without a
    ``<span class="txt">`` title are skipped.

    Args:
        searchstr: Search term inserted verbatim into the search URL.

    Returns:
        ``('選択した条件で商品は存在しませんでした', 1)`` when the page
        contains that "no products" message. Otherwise a 2-tuple
        ``(boxlist, stitle)``: ``boxlist`` is a list of dicts with the keys
        ``cid``, ``keyword``, ``links``, ``img``, ``title``, ``sublinks``
        and ``subtexts`` (``'-'`` for a field that cannot be extracted,
        ``''`` for ``keyword`` when no keyword spans exist), and ``stitle``
        is the page ``<title>`` text or ``'検索結果'`` when no title is
        found. ``boxlist`` is empty when the page has no ``d-area`` div.
    """
    def first_match(pattern, text):
        # First regex capture, or the '-' placeholder when nothing matches.
        found = re.findall(pattern, text)
        return found[0] if found else '-'

    url = 'https://www.dmm.co.jp/digital/videoa/-/list/search/=/limit=30/?searchstr={}'.format(
        searchstr)
    html = get_html_jp(url)

    # Check whether the search produced any result.
    noresult = '選択した条件で商品は存在しませんでした'
    if noresult in html:
        return (noresult, 1)

    soup = BeautifulSoup(html, 'lxml')
    searchbody = soup.find('div', attrs={'class': 'd-area'})
    titles = re.findall(r'<title>(.*?)</title>', html)
    stitle = titles[0] if titles else '検索結果'

    # A page without the result area previously crashed with AttributeError.
    if searchbody is None:
        return ([], stitle)

    boxall = searchbody.find_all('li', attrs={'style': 'width: 130px;'})
    boxlist = []
    for box in str(boxall).split('</div></li>'):
        # Fragments without a title span (e.g. the trailing "]" left over by
        # the split) are not product boxes.
        if not re.findall(r'<span class=\"txt\">(.*?)</span>', box):
            continue

        boxdict = {}
        boxdict['cid'] = first_match(
            r'<a href=\"https://www\.dmm\.co\.jp/digital/videoa/-/detail/=/cid=(\w+)/\?.*?\">',
            box)
        boxdict['keyword'] = ','.join(re.findall(
            r'<span class=\"ico-st-\w+\"><span>(.*?)</span></span>', box))
        boxdict['links'] = first_match(
            r'<a href=\"(https://www\.dmm\.co\.jp/digital/videoa/-/detail/=/cid=\w+/\?.*?)\">',
            box)
        boxdict['img'] = first_match(
            r'<span class=\"img\"><img alt=\".*?\" src=\"(https://pics.dmm.co.jp/digital/video/\w+/\w+.jpg)\"/></span>',
            box)
        boxdict['title'] = first_match(
            r'<span class=\"img\"><img alt=\"(.*?)\" src=\"https://pics.dmm.co.jp/digital/video/\w+/\w+.jpg\"/></span>',
            box)
        sublinks = re.findall(
            r'<span><a href=\"(/digital/videoa/-/list/search/=/limit=30/.*?)/\">.*?</a></span>',
            box)
        boxdict['sublinks'] = (
            'https://www.dmm.co.jp' + sublinks[0] if sublinks else '-')
        boxdict['subtexts'] = first_match(
            r'<span><a href=\"/digital/videoa/-/list/search/=/limit=30/.*?/\">(.*?)</a></span>',
            box)
        boxlist.append(boxdict)
    return (boxlist, stitle)