Example #1
def gen_tsv(urls, site, parser):
    """TSVデータジェネレータ"""
    total = rest = len(urls)

    for url in urls:
        verbose('url: ', url)

        resp, he = libssw.open_url(url)

        rest -= 1
        libssw.inprogress('(残り {} 件/全 {} 件)  '.format(rest, total))

        if resp.status == 404:
            continue

        title = he.xpath('.//title')[0].text

        if title.endswith('Error report') or title.startswith('未公開作品'):
            continue

        item = parser(he, site)
        if item:
            yield item

    print(file=sys.stderr, flush=True)
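
A minimal driver sketch for the generator above: urls, site and parser are placeholders for the caller's own values, and each yielded item is assumed to be a sequence of column values.

# Hypothetical wiring: write one tab-separated line per parsed item;
# gen_tsv() itself reports progress on stderr.
def write_tsv(urls, site, parser):
    for item in gen_tsv(urls, site, parser):
        print('\t'.join(map(str, item)))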
Example #2
def _build(aids):
    """Return pairs of the formatted actress name and her ID"""
    for aid in aids:
        resp, he = libssw.open_url(ACTINFOPAGE.format(aid),
                                   'utf-8',
                                   set_cookie='age_check_done=1')
        yield '/'.join(_getnames(he)), ACTLISTPAGE.format(aid)
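
A usage sketch, assuming aids is an iterable of actress IDs and the module-level ACTINFOPAGE/ACTLISTPAGE templates are defined.

# Hypothetical IDs; prints one "name<TAB>list-page URL" line each.
for name, listurl in _build(['1017', '26225']):
    print(name, listurl, sep='\t')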
Example #3
def hitodumagiri(he, url):
    studio = '人妻斬り'

    performer = he.find_class('name_JP_hitozuma')[0].text.strip()
    age = libssw.extr_num(
        he.xpath('//table[@summary="movie info"][1]//tr[1]/td')[1].text)[0]

    title = performer + age + '才'

    qname = '+'.join(map(libssw.quote, performer.split()))
    srchurl = 'http://www.c0930.com/search/?q={}&x=0&y=0&category_search_type=and&flag_match_type=0'.format(
        qname)

    release = None
    while not release:
        resp, srchp = libssw.open_url(srchurl)
        for div in srchp.find_class('unit-thumbs ori1'):
            if div[1].get('href') == url:
                release = libssw.extr_num(div[0].text)
                break
        else:
            # Not on this page: follow the "next" link in the search
            # results; find_class() returns a list, so test for emptiness.
            nextbold = srchp.find_class('next bold')
            if nextbold:
                srchurl = nextbold[0].get('href')
            else:
                break

    uncensored(url, release, title, studio, [performer], '')
Example #4
def get_elems(props):
    resp, he = libssw.open_url(props['url'])
    if resp.status != 200:
        emsg(
            'W', 'ページを取得できませんでした: '
            'url="{0[url]}", title={0[title]}, status={1}'.format(
                props, resp.status))
        return False

    return he.xpath('//td[@class="nw"]')
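
get_elems() returns False on a non-200 response and a (possibly empty) list of elements otherwise, so callers should test identity rather than truthiness. A caller sketch, with a placeholder props dict:

props = {'url': 'http://example.com/page', 'title': 'sample'}  # placeholders
cells = get_elems(props)
if cells is False:
    pass  # fetch failed; a warning was already emitted via emsg()
else:
    for td in cells:
        print(td.text_content())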
Example #5
def open_wiki(*pages):
    """wikiページをウェブブラウザで開く"""
    for p in filter(None, pages):
        url = _up.urljoin('http://sougouwiki.com/d/', _libssw.quote(p))
        resp, he = _libssw.open_url(url)
        if resp.status == 200:
            dest = _libssw._rdrparser(p, he)
            if dest != p and dest != '':
                # Re-fetch the redirect target
                url = _up.urljoin('http://sougouwiki.com/d/',
                                  _libssw.quote(dest))
                resp, he = _libssw.open_url(url)
            inner = he.find_class('inner')[0]
            editurl = inner.xpath('.//a')[0].get('href')
            if editurl:
                _webbrowser.open_new_tab(editurl)
        else:
            message = p + ' : ページが見つかりませんでした'
            label7.config(text=message)
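
A call sketch: filter(None, pages) drops empty entries, so optional page names can be passed directly. Note the failure branch updates label7, a module-level Tkinter widget assumed to exist.

# Hypothetical page names; None and '' are skipped.
open_wiki('ページ名A', None, '', 'ページ名B')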
Example #6
    def parse(self, url):
        # lxml version (not working)
        self.url = url

        resp, he = libssw.open_url(self.url, set_cookie="adc=1")

        self.title = he.find_class('tag')[0].text_content()
        # self.title = he.find('h1')
        self.release = he.xpath(
            '//div[@class="detail_data"]/table[1]')[0].text_content()
        # self.release = he.findall("table")
        # self.release = he.findall("table")[1].find("td")
        return self.title
Example #7
    def parse_contents_market(self, url, he=None):
        self.url = url
        if he is None:
            resp, he = libssw.open_url(url, None)

        self.title = he.find_class('detail')[0].find('h2').text_content()
        self.release = he.find_class('main_info_block')[0].find('dl').findall(
            'dd')[3].text_content().split("/")
        self.series = he.find_class('main_info_block')[0].find('dl').findall(
            'dd')[4].text_content()
        # release = he.find_class('main_info_block')[0].find('h2').find('dd')[5].text_content()
        self.img_s = "http:" + he.find_class(
            'analyticsLinkClick_mainThum')[0].find('img').get('src')
        self.img_l = he.find_class('analyticsLinkClick_mainThum')[0].get(
            'href')
Example #8
def searchwiki_by_url(url):
    """検索結果から記事名を返すジェネレータ"""
    resp, he = libssw.open_url(
        'http://sougouwiki.com/search?keywords={}'.format(libssw.quote(url)),
        cache=False)

    searchresult = he.find_class('result-box')[0].xpath('p[1]/strong')[0].tail

    if searchresult.strip() == 'に該当するページは見つかりませんでした。':
        verbose('url not found on ssw')
        return

    while True:
        for a in he.iterfind('.//h3[@class="keyword"]/a'):
            yield a.get('href'), a.text

        # If there is a next page, search it too
        he = libssw.ssw_searchnext(he)
        if he is None:
            break
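
A sketch of consuming the generator above; when the search has no hits it returns before yielding, so the loop body never runs. The URL is a placeholder.

for href, name in searchwiki_by_url('http://example.com/product/123'):
    print(name, href, sep='\t')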
Example #9
def check_actrpage(actr_url, listp, prod_url):
    """
    女優ページに作品情報があるか、一覧ページへのリンクがちゃんとあるかチェック
    """
    # 女優ページの取得
    resp, html = libssw.open_url(actr_url, cache=False, to_elems=False)
    if resp.status == 404:
        return False, 404, False

    # Scan each line of the actress page; take any line containing the DMM product page URL
    for line in filter(lambda x: prod_url in x, html.splitlines()):

        link2list = re_ssw.findall(line)
        verbose('link to listpage: ', link2list)

        return (True, link2list, listp.lower()
                in tuple(L.lower() for L in link2list))

    # Product info was not found
    verbose('prod info not found')
    return False, False, False
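
A caller sketch for the 3-tuple above: (product info found, link-to-list candidates or HTTP status, whether listp is among them). All three arguments are placeholders.

actr_url = 'http://sougouwiki.com/d/example-actress'
listp = 'example-list-page'
prod_url = 'http://www.dmm.co.jp/example-product'
found, links, linked = check_actrpage(actr_url, listp, prod_url)
if not found:
    print('no product info (got:', links, ')')
elif not linked:
    print('list page not linked; candidates:', links)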
Example #10
def ipondo(he, url):
    studio = '一本道'

    title = he.xpath('//h1')[0].getnext().text_content().strip()
    performers = he.xpath('//h1/a')[0].text.strip().split()

    # Get the release date from the search results page
    # Build the search string (URL)
    qlist = performers[:]
    qlist.append(title)
    verbose('qlist: ', qlist)
    searchurl = 'http://www.1pondo.tv/list.php?q={}&op=and'.format('+'.join(
        libssw.quote(s) for s in qlist))
    verbose('searchurl: ', searchurl)

    release = None
    while not release:
        r, e = libssw.open_url(searchurl)
        for div in e.iterfind('.//div[@class="list_container"]/div'):
            a = div.xpath('a')[0]
            verbose('a: ', a.get('href'))
            if a.get('href') == url:
                release = libssw.extr_num(div.xpath('p')[0].text_content())
                break
        else:
            # Not on this page: move on to the next page of search results
            for pagin in e.iterfind(
                    './/div[@class="listblock"]/p[@align="right"]/a'):
                if pagin.text.strip() == '次へ':
                    searchurl = 'http://www.1pondo.tv{}'.format(
                        pagin.get('href'))
                    break
            else:
                # Never found: fall back to a dummy release date
                emsg('W', '配信開始日を取得できませんでした。')
                release = ('0000', '00', '00')

    uncensored(url, release, title, studio, performers, '')
Example #11
def main(props=_libssw.Summary(), p_args=_argparse.Namespace(), dmmparser=None):

    # When called as a module, ignore inherited command-line arguments
    argv = [props.url] if __name__ != '__main__' else _sys.argv[1:]
    args = _get_args(argv, p_args)

    # Product information
    summ = _libssw.Summary()

    if __name__ == '__main__':
        _verbose('args: ', args)
        if not args.url:
            # No URL passed: read from standard input
            _verbose('Input from stdin...')

            data = _sys.stdin.readline().rstrip('\n')

            if not data:
                _emsg('E', 'URLを指定してください。')

            for key, data in zip(('url', 'title', 'pid', 'actress', 'number',
                                  'director', 'director', 'note'),
                                 data.split('\t')):
                if key == 'url':
                    summ[key] = data.split('?')[0]
                elif key == 'actress':
                    summ[key] = list(_libssw.parse_names(data))
                elif key == 'number':
                    summ[key] = int(data) if data else 0
                elif key == 'director':
                    summ[key] = _libssw.re_delim.split(data)
                elif key == 'note':
                    summ[key].append(data)
                else:
                    summ[key] = data

            _verbose('summ from stdin: ', summ.items())

        for attr in ('url', 'number', 'pid', 'subtitle'):
            if not summ[attr]:
                summ[attr] = getattr(args, attr)

        if not summ['actress'] and args.actress:
            actiter = _chain.from_iterable(
                map(_libssw.re_delim.split, args.actress))
            summ['actress'] = list(_libssw.parse_names(actiter))

    else:
        _verbose('props: ', props.items())
        _verbose('p_args: ', vars(p_args))

        summ.update(props)

    summ['link_label'] = getattr(args, 'label')
    summ['link_series'] = getattr(args, 'series')

    retrieval = getattr(p_args, 'retrieval',
                        'series' if args.as_series else 'find')
    service = getattr(p_args, 'service', None)
    series_guide = getattr(p_args, 'series_guide', True)

    if args.actress and args.actress[0].startswith('@@'):
        # Performers specified directly as wikitext
        rawpfmrs = args.actress[0][2:]
    else:
        rawpfmrs = ''

    # Determine the service automatically when unspecified
    if not service:
        service = _libssw.resolve_service(summ['url'])
    _verbose('service resolved: ', service)

    if service == 'ama':
        # For amateur videos, do not output the director column.
        args.dir_col = False

    join_d = dict()
    _libssw.ret_joindata(join_d, args)

    if (args.join_tsv or args.join_wiki or args.join_html) and not len(join_d):
        _emsg('E', '--join-* オプションで読み込んだデータが0件でした。')

    # Open and read the URL
    resp, he = _libssw.open_url(summ['url'], set_cookie='age_check_done=1')

    if resp.status == 404:
        # On 404, build an empty entry (table format only) and return it
        _emsg('I', 'ページが見つかりませんでした: ', summ['url'])
        if not summ['pid']:
            summ['pid'], summ['cid'] = _libssw.gen_pid(summ['url'])
        if p_args.cid_l:
            summ['url'] = ''
        else:
            if not summ['subtitle']:
                summ['subtitle'] = summ['title']
            summ['image_sm'], summ['image_lg'] = _build_image_url(
                service, summ['cid'])
        wktxt_t = _format_wikitext_t(summ, '', '/'.join(summ['director']),
                                     args.dir_col,
                                     _build_addcols(args.add_column,
                                                    summ), retrieval)
        _verbose('wktxt_t: ', wktxt_t)
        return False, resp.status, _ReturnVal(summ['release'],
                                              summ['pid'],
                                              summ['title'],
                                              summ['title_dmm'],
                                              summ['url'],
                                              summ['time'],
                                              summ('maker', 'maker_id'),
                                              summ('label', 'label_id'),
                                              summ('series', 'series_id'),
                                              summ['actress'],
                                              summ['link_label'],
                                              summ['link_series'],
                                              wktxt_a='',
                                              wktxt_t=wktxt_t)
    elif resp.status != 200:
        return False, resp.status, ('HTTP status', resp.status)

    # Fix syntax errors
    # html = _libssw.sub(sub_href, html)

    # Parse the HTML
    if not dmmparser:
        dmmparser = _libssw.DMMParser(autostrip=args.autostrip,
                                      longtitle=args.longtitle,
                                      check_rental=args.check_rental,
                                      check_rltd=args.check_rltd)

    try:
        summ.update(dmmparser(he, service, summ, ignore_pfmrs=rawpfmrs))
    except _libssw.OmitTitleException as e:
        # Excluded item: abort
        return False, 'Omitted', (e.key, e.word)

    _verbose('summ: ', summ.items())

    if dmmparser.data_replaced:
        service = dmmparser.data_replaced

    # Merge join data if available
    if summ['url'] in join_d:
        summ.merge(join_d[summ['url']])

    if args.pid:
        summ['pid'] = args.pid

    # Auto-generate image links when not yet present
    if not summ['image_lg']:
        summ['image_sm'], summ['image_lg'] = _build_image_url(
            service, summ['cid'])
        _verbose('image_sm: ', summ['image_sm'])
        _verbose('image_lg: ', summ['image_lg'])

    #
    # Title adjustments
    #
    # Handle removal requests
    for dl in _libssw.HIDE_NAMES_V:
        summ['title'] = summ['title'].replace(dl, '').strip()

    on_dmm = summ['title']
    # Replace strings that clash with wiki syntax
    modified = _libssw.trans_wikisyntax(on_dmm)
    if _AUTOMODIFY:
        # Replace the substitute string for ♥
        modified = _libssw.sub(_sub_heart, modified)

    summ['title'] = modified
    if not summ['title_dmm'] and modified != on_dmm:
        summ['title_dmm'] = on_dmm
    _verbose('summ[title]: ', summ['title'])
    _verbose('summ[title_dmm]: ', summ['title_dmm'])

    # Set link info for the label/series list pages
    if (not args.hide_list) and args.check_listpage:
        _resolve_pagelink(summ, args)
    _verbose('summ[link_label]: ', summ['link_label'])
    _verbose('summ[link_series]: ', summ['link_series'])

    if args.note:
        summ['note'] = list(_expansion(args.note, summ)) + summ['note']
    _verbose('note: ', summ['note'])

    add_column = _build_addcols(args.add_column, summ)
    _verbose('add column: ', add_column)

    # Build the performers string
    pfmrslk = ()
    if rawpfmrs:
        # Raw wikitext given
        pfmrslk = _libssw.re_linkpare.findall(rawpfmrs)
        pfmrsstr, pnum = rawpfmrs, len(pfmrslk)
    elif len(summ['actress']) < 2 and not summ['number'] and args.table == 0:
        # Building only the actress page with a single performer: skip
        pfmrsstr, pnum = '', 0
    else:
        pfmrsstr, pnum = _libssw.stringize_performers(summ['actress'],
                                                      summ['number'],
                                                      args.follow_rdr)

    # Build the director string
    dirstr = '/'.join(summ['director'])

    # Generate the subtitle for table format
    if retrieval == 'series':
        # If the series name is in list_page and the title begins with
        # that same string, drop it from the title.
        # If list_page has no value, use the title as-is.
        if not summ['subtitle']:
            summ['subtitle'] = _re.sub(r'^{}[、。!?・…♥]*'.format(summ['series']),
                                       '',
                                       summ['title'],
                                       flags=_re.I).strip()

    elif not summ['subtitle']:
        # Use the title as-is for the subtitle (table format)
        summ['subtitle'] = summ['title']
    _verbose('subtitle: ', summ['subtitle'])

    # Check for information not yet retrieved
    if _VERBOSE:
        _check_missings(summ)

    # Build the wikitext
    wikitext_a = _format_wikitext_a(summ, pnum, pfmrsstr,
                                    service) if args.table != 1 else ()
    wikitext_t = _format_wikitext_t(summ, pfmrsstr, dirstr, args.dir_col,
                                    add_column,
                                    retrieval) if args.table else ''

    if __name__ != '__main__':
        # Called as a module: return a tuple.
        return True, summ['url'], _ReturnVal(
            summ['release'], summ['pid'], summ['title'],
            summ['title_dmm'], summ['url'], summ['time'],
            summ('maker', 'maker_id'), summ('label', 'label_id'),
            summ('series', 'series_id'), summ['actress'], summ['link_label'],
            summ['link_series'], wikitext_a, wikitext_t)
    else:
        # Write the output
        output = ['']
        if wikitext_a:
            output.append(wikitext_a)

        if wikitext_t:
            output.append(wikitext_t)

        print(*output, sep='\n')

        if args.copy:
            _verbose('copy 2 clipboard')
            _libssw.copy2clipboard(''.join(output))

        if args.browser:
            # Open the wiki pages
            if args.table != 1:
                pages = pfmrslk or summ['actress']
                for a in pages:
                    _libssw.open_ssw(a[1] or a[0])
            if args.table:
                _libssw.open_ssw(summ['link_label'])
                _libssw.open_ssw(summ['link_series'])
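
Example #14 below drives this entry point as a module. A minimal sketch of that calling convention; the URL and Namespace fields are placeholders, and reading wktxt_t off the result assumes the _ReturnVal field names used in the 404 branch above.

ok, key, ret = main(
    props=_libssw.Summary(url='http://www.dmm.co.jp/example-product'),
    p_args=_argparse.Namespace(fastest=True, hide_list=True))
if ok:
    print(ret.wktxt_t)  # table-format wikitext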
Example #12
def _build(aids):
    """Return pairs of the formatted actress name and her ID"""
    for aid in aids:
        resp, he = libssw.open_url(ACTINFOPAGE.format(aid))
        yield '/'.join(_getnames(he)), ACTLISTPAGE.format(aid)
Example #13
def main():

    args = get_args()

    # urls = ('http://' + u if not u.startswith('http://') else u
    # for u in args.url)

    urls = ('https://' + u if not re.match('^https?://.+', u) else u
            for u in args.url)

    for url in urls:

        netloc = urlparse(url).netloc

        if netloc == 'www.mgstage.com':
            f = mgs()
            f.parse_by_bs4(url)
            # f.parse(url)
            f.print_cencered()

        else:
            resp, he = libssw.open_url(
                url,
                charset='euc_jisx0213'
                if netloc.endswith('h-paradise.net') else None)

            if netloc in ('www.aventertainments.com', 'www.avfantasy.com',
                          'www.mediafreakcity.com'):
                japorn(he, url)

            elif netloc == 'www.1pondo.tv':
                ipondo(he, url)

            elif netloc in ('www.caribbeancom.com', 'www.caribbeancompr.com'):
                caribbean(he, netloc, url)

            elif netloc == 'www.heyzo.com':
                heyzo(he, url)

            elif netloc == 'www.heydouga.com':
                heydouga(he, url)

            elif netloc == 'my.tokyo-hot.com':
                tokyohot(he, url)

            elif netloc == 'www.pacopacomama.com':
                pacopacomama(he, url)

            elif netloc == 'www.10musume.com':
                tenmusume(he, url)

            elif netloc == 'www.jukujo-club.com':
                jukujoclub(he, url)

            elif netloc.endswith('h-paradise.net'):
                h_paradise(he, url)

            elif netloc == 'www.c0930.com':
                hitodumagiri(he, url)

            elif netloc == 'adult.contents.fc2.com':
                f = fc2()
                fc2_id = f.url2id(url)
                f.fc2_id = fc2_id
                f.parse_contents_market(url, he=he)
                f.print_cencered()

            else:
                emsg('E', '未知のサイトです。')
Example #14
def select_allhiragana(ids, today, path, selfcheck):
    no_omits = libssw.gen_no_omits((0, 4))

    dmmparser = libssw.DMMParser(no_omits=no_omits,
                                 deeper=False,
                                 pass_bd=True,
                                 quiet=True)
    conn = sqlite3.connect(path)
    cur = conn.cursor()

    if ids:
        sql = 'select id,current,last_release from main ' \
              'where id in({}) '.format(','.join('?' * len(ids)))
        ph = list(ids)
    else:
        sql = 'select id,current,last_release from main ' \
              'where last_release is not null and ' \
              'retired is null and deleted is null '
        ph = []

    if not selfcheck:
        ayearago = str(today - timedelta(days=365))
        sql += 'and last_release > ?'
        ph.append(ayearago)

    print('sql:', sql, ', ph:', ph)

    for aid, name, last_release in filter(
            lambda n: not libssw.re_neghirag.search(n[1]) and len(n[1]) < 5,
            cur.execute(sql, ph)):
        verbose(aid, name)
        print(aid, name + ': ', end='')
        url = 'http://actress.dmm.co.jp/-/detail/=/actress_id={}/'.format(aid)
        verbose('url: ', url)

        if not selfcheck:
            # Not self-checking: just yield the name
            print('({})'.format(last_release))
            yield name
            continue

        resp, he = libssw.open_url(url)

        while he is not None:
            info = he.find('.//td[@class="info_works1"]')
            if info is None:
                print('negative')
                break

            for tr in info.getparent().getparent()[1:]:
                title = tr.find('td/a').text
                verbose('title: ', title)

                if tr[4].text == '---' and tr[6].text == '---':
                    verbose('Not DVD and Rental')
                    continue

                sale = tr[4].find('a')
                rental = tr[6].find('a')
                prod_url = sale.get('href') if sale is not None \
                           else rental.get('href')
                verbose('prod url: ', prod_url)
                cid = libssw.get_id(prod_url, cid=True)[0]

                if libssw.check_omit(title, cid, no_omits=no_omits):
                    continue

                b, status, values = dmm2ssw.main(
                    props=libssw.Summary(url=prod_url),
                    p_args=argparse.Namespace(fastest=True, hide_list=True),
                    dmmparser=dmmparser)
                if status in {'Omitted', 404}:
                    verbose('Omitted: status=', status, ', values=', values)
                    continue
                verbose('return from dmm2ssw: ', values)

                last_web = tr[7].text.strip()
                if last_release != last_web:
                    print('last_rel is different (db:{} <=> web:{}), '.format(
                        last_release, last_web),
                          end='')
                datelast = date(*map(int, last_web.split('-')))

                if (today - datelast).days < 366:

                    yield name
                    print('\033[1;33mpositive ({})\033[0m'.format(last_web))

                else:
                    print('negative ({})'.format(last_web))

                he = None
                break

            else:
                mu = he.get_element_by_id("mu").xpath('table[last()]//a')
                if len(mu) and mu[-1].text == '次へ':
                    resp, he = libssw.open_url(
                        BASEURL_ACT + mu[-1].get('href'))
                else:
                    print('negative')
                    break

    conn.close()
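
A driver sketch for the generator above; the database path is a placeholder, and an empty ids selects every active actress.

from datetime import date

# Yield names with a release in the past year without re-checking DMM.
for name in select_allhiragana((), date.today(), 'actresses.db', False):
    print(name)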