Exemples Python de get_html_jp — exemples Python de function_requests.get_html_jp

Exemple #1

0

Afficher le fichier

def dmmcid(in_q, out_q):
    """Drain a queue of page URLs and collect the cids linked from each page.

    For every URL taken from ``in_q`` the page is fetched with
    ``get_html_jp`` and all ``.../detail/=/cid=<cid>/`` links are extracted.

    Args:
        in_q: Work queue of URLs. Must provide ``get_nowait()`` and
            ``task_done()`` (for example ``queue.Queue``).
        out_q: Collector providing ``append()``; receives one list of cid
            strings per processed URL (possibly an empty list).

    Returns:
        None. The function returns once ``in_q`` has no more items.
    """
    # Local import so the block does not depend on the file's import header.
    from queue import Empty

    while True:
        # get_nowait() instead of empty()+get(): with several workers another
        # one may take the last item between the two calls, and a blocking
        # get() would then wait forever.
        try:
            url = in_q.get_nowait()
        except Empty:
            return
        try:
            html = get_html_jp(url)
            cids = re.findall(
                r'https://www\.dmm\.co\.jp/digital/videoa/-/detail/=/cid=([_0-9a-z]+)/',
                html)
            out_q.append(cids)
        finally:
            # Always balance the get(), even if the fetch fails, so that a
            # caller waiting on in_q.join() is not blocked indefinitely.
            in_q.task_done()

Exemple #2

0

Afficher le fichier

def dmmonecid(searchcid):
    """Look up a single title by cid and return its rendered details.

    Every ``'-'`` in ``searchcid`` is replaced with ``'00'`` before the
    detail page is requested. The fetched HTML is handed to ``ciddata``;
    unless that reports the "page not found" marker, its result is passed
    through ``template_cid``.

    Args:
        searchcid: The cid (or hyphenated product code) to look up.

    Returns:
        A ``(result, notitle)`` tuple, where ``result`` is either the
        not-found marker string returned by ``ciddata`` or the output of
        ``template_cid``, and ``notitle`` is the second value from ``ciddata``.
    """
    detail_url = 'https://www.dmm.co.jp/digital/videoa/-/detail/=/cid={}/'.format(
        searchcid.replace('-', '00'))
    parsed, notitle = ciddata(get_html_jp(detail_url))
    # The not-found marker is returned to the caller untouched.
    if parsed != '指定されたページが見つかりません':
        parsed = template_cid(parsed)
    return parsed, notitle

Exemple #3

0

Afficher le fichier

def findinfo(articleid):
    """Return the page count and display title of an actress listing.

    Args:
        articleid: Actress article id, interpolated into the listing URL.

    Returns:
        A ``(last_page, title)`` tuple. ``last_page`` is the highest page
        number linked from the listing (``1`` when no page links are
        present) and ``title`` is the text captured from the page's
        ``<title>`` element before the fixed FANZA suffix.

    Raises:
        IndexError: if the page ``<title>`` does not match the expected form.
    """
    url = "https://www.dmm.co.jp/digital/videoa/-/list/=/article=actress/id=%s/" % articleid
    html = get_html_jp(url)
    page_numbers = re.findall(
        r'/digital/videoa/-/list/=/article=actress/id=\d+/page=(\d+)/', html)
    title = re.findall(r'<title>(.*) - エロ動画・アダルトビデオ - FANZA動画</title>', html)
    # The previous hand-rolled de-duplication compared str against int and so
    # never removed anything; max() does not need unique values anyway.
    last_page = max(map(int, page_numbers), default=1)
    return (last_page, title[0])

Exemple #4

0

Afficher le fichier

def prephotos(searchcid):
    """Return the sample-image URLs of a title, rewritten to the 'jp' variant.

    The detail page for ``searchcid`` is fetched, the ``img`` tags with
    class ``mg-b6`` are collected, and every ``pics.dmm.co.jp`` JPEG URL
    found in them has ``'jp'`` inserted in its file name.

    Args:
        searchcid: The cid whose detail page should be scraped.

    Returns:
        A list of rewritten image URL strings (empty if none were found).
    """
    searchurl = 'https://www.dmm.co.jp/digital/videoa/-/detail/=/cid={}/'.format(
        searchcid)
    html = get_html_jp(searchurl)
    soup = BeautifulSoup(html, 'lxml')
    thumbnails = soup.find_all('img', attrs={'class': 'mg-b6'})
    photourls = re.findall(
        r'(https://pics.dmm.co.jp/digital/video/.*?/.*?.jpg)', str(thumbnails))
    return [_jp_photo_url(url) for url in photourls]


def _jp_photo_url(url):
    """Insert 'jp' before the trailing '-<number>' of an image file name."""
    head, dash, tail = url.rpartition('-')
    # tail is expected to look like '<number>.jpg'. The old fixed offset of
    # six characters only landed before the dash for single-digit numbers
    # ('-1.jpg'); '-10.jpg' was turned into '-jp10.jpg'.
    # NOTE(review): presumably the large images are named '<cid>jp-<n>.jpg'
    # for every n - confirm against a title with 10+ sample images.
    if dash and tail[:-4].isdigit():
        return head + 'jp-' + tail
    # No numeric suffix: keep the original behaviour unchanged.
    return url[:-6] + 'jp' + url[-6:]

Exemple #5

0

Afficher le fichier

def dmmlinks_data(links):
    """Scrape one DMM listing page into a list of per-title records.

    Args:
        links: URL of the listing page; fetched with ``get_html_jp``.

    Returns:
        A ``(boxlist, stitle)`` tuple. ``boxlist`` holds one dict per listed
        item with the keys ``cid``, ``keyword``, ``links``, ``img``,
        ``title``, ``sublinks`` and ``subtexts``. A field that cannot be
        found is ``'-'``, except ``keyword`` which is ``''`` when the item
        carries no tags. Items without a ``<span class="txt">`` caption are
        left out. ``stitle`` is the page ``<title>`` text, or ``'検索結果'``
        when it cannot be extracted.

    Raises:
        AttributeError: if the page contains no ``<div class="d-area">``.
    """

    def first_match(pattern, text):
        """Return the first regex match in *text*, or '-' when there is none."""
        found = re.findall(pattern, text)
        return found[0] if found else '-'

    html = get_html_jp(links)
    soup = BeautifulSoup(html, 'lxml')
    searchbody = soup.find('div', attrs={'class': 'd-area'})
    try:
        stitle = re.findall(r'<title>(.*?)</title>', html)[0]
    except (IndexError, TypeError):
        # No <title> element, or html is not text: use the generic heading.
        stitle = '検索結果'
    boxall = searchbody.find_all('li', attrs={'style': 'width: 130px;'})

    boxlist = []
    # The serialised result list is cut into one HTML fragment per item.
    for box in str(boxall).split('</div></li>'):
        # Fragments without a caption (such as the closing ']' of the list
        # repr) are not real items; skip them before doing any parsing.
        if not box or not re.search(r'<span class=\"txt\">(.*?)</span>', box):
            continue

        boxdict = {}
        boxdict['cid'] = first_match(
            r'<a href=\"https://www\.dmm\.co\.jp/digital/videoa/-/detail/=/cid=(\w+)/\?.*?\">',
            box)
        boxdict['keyword'] = ','.join(re.findall(
            r'<span class=\"ico-st-\w+\"><span>(.*?)</span></span>', box))
        boxdict['links'] = first_match(
            r'<a href=\"(https://www\.dmm\.co\.jp/digital/videoa/-/detail/=/cid=\w+/\?.*?)\">',
            box)
        boxdict['img'] = first_match(
            r'<span class=\"img\"><img alt=\".*?\" src=\"(https://pics.dmm.co.jp/digital/video/\w+/\w+.jpg)\"/></span>',
            box)
        boxdict['title'] = first_match(
            r'<span class=\"img\"><img alt=\"(.*?)\" src=\"https://pics.dmm.co.jp/digital/video/\w+/\w+.jpg\"/></span>',
            box)

        try:
            slink = BeautifulSoup(box, 'lxml').find(
                'p', attrs={'class': 'sublink'})
        except Exception:
            # Best effort, as before: an unparsable fragment simply has no
            # sub-link (and no longer reuses the previous item's one).
            slink = None
        anchor = slink.find('a') if slink is not None else None
        href = anchor.get('href') if anchor is not None else None
        if href is not None:
            boxdict['sublinks'] = 'https://www.dmm.co.jp' + href
        else:
            boxdict['sublinks'] = '-'
        # .string may be None when the anchor has nested markup; that value
        # is passed through unchanged.
        boxdict['subtexts'] = anchor.string if anchor is not None else '-'

        boxlist.append(boxdict)
    return (boxlist, stitle)

Exemple #6

0

Afficher le fichier

def dmmsearch_data(searchstr):
    """Run a DMM video search and return one record per hit.

    Args:
        searchstr: Search text, placed verbatim into the ``searchstr`` query
            parameter of the search URL.
            NOTE(review): the value is not URL-encoded here - confirm whether
            callers or ``get_html_jp`` take care of that.

    Returns:
        ``(noresult_message, 1)`` when the page reports that no product
        matches. Otherwise a ``(boxlist, stitle)`` tuple: ``boxlist`` holds
        one dict per hit with the keys ``cid``, ``keyword``, ``links``,
        ``img``, ``title``, ``sublinks`` and ``subtexts``. A field that
        cannot be found is ``'-'``, except ``keyword`` which is ``''`` when
        the hit carries no tags. Hits without a ``<span class="txt">``
        caption are left out. ``stitle`` is the page ``<title>`` text, or
        ``'検索結果'`` when it cannot be extracted.

    Raises:
        AttributeError: if a page with results has no ``<div class="d-area">``.
    """

    def first_match(pattern, text, prefix=''):
        """Return prefix + first regex match in *text*, or '-' when none."""
        found = re.findall(pattern, text)
        return prefix + found[0] if found else '-'

    url = 'https://www.dmm.co.jp/digital/videoa/-/list/search/=/limit=30/?searchstr={}'.format(
        searchstr)
    html = get_html_jp(url)

    # Check whether the search produced any results at all.
    noresult = '選択した条件で商品は存在しませんでした'
    if re.search(r'(選択した条件で商品は存在しませんでした)', html):
        return (noresult, 1)

    soup = BeautifulSoup(html, 'lxml')
    searchbody = soup.find('div', attrs={'class': 'd-area'})
    titles = re.findall(r'<title>(.*?)</title>', html)
    stitle = titles[0] if titles else '検索結果'
    boxall = searchbody.find_all('li', attrs={'style': 'width: 130px;'})

    boxlist = []
    # The serialised result list is cut into one HTML fragment per hit.
    for box in str(boxall).split('</div></li>'):
        # Fragments without a caption (such as the closing ']' of the list
        # repr) are not real hits; skip them before extracting anything.
        if not box or not re.search(r'<span class=\"txt\">(.*?)</span>', box):
            continue

        boxdict = {}
        boxdict['cid'] = first_match(
            r'<a href=\"https://www\.dmm\.co\.jp/digital/videoa/-/detail/=/cid=(\w+)/\?.*?\">',
            box)
        boxdict['keyword'] = ','.join(re.findall(
            r'<span class=\"ico-st-\w+\"><span>(.*?)</span></span>', box))
        boxdict['links'] = first_match(
            r'<a href=\"(https://www\.dmm\.co\.jp/digital/videoa/-/detail/=/cid=\w+/\?.*?)\">',
            box)
        boxdict['img'] = first_match(
            r'<span class=\"img\"><img alt=\".*?\" src=\"(https://pics.dmm.co.jp/digital/video/\w+/\w+.jpg)\"/></span>',
            box)
        boxdict['title'] = first_match(
            r'<span class=\"img\"><img alt=\"(.*?)\" src=\"https://pics.dmm.co.jp/digital/video/\w+/\w+.jpg\"/></span>',
            box)
        # The captured sub-link path is site-relative, so the host is added.
        boxdict['sublinks'] = first_match(
            r'<span><a href=\"(/digital/videoa/-/list/search/=/limit=30/.*?)/\">.*?</a></span>',
            box, prefix='https://www.dmm.co.jp')
        boxdict['subtexts'] = first_match(
            r'<span><a href=\"/digital/videoa/-/list/search/=/limit=30/.*?/\">(.*?)</a></span>',
            box)

        boxlist.append(boxdict)
    return (boxlist, stitle)