def gen_tsv(urls, site, parser):
    """TSV data generator"""
    total = rest = len(urls)
    for url in urls:
        verbose('url: ', url)
        resp, he = libssw.open_url(url)
        rest -= 1
        libssw.inprogress('(残り {} 件/全 {} 件) '.format(rest, total))
        if resp.status == 404:
            continue
        title = he.xpath('.//title')[0].text
        if title.endswith('Error report') or title.startswith('未公開作品'):
            continue
        item = parser(he, site)
        if item:
            yield item
    print(file=sys.stderr, flush=True)
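
# A minimal driver sketch for gen_tsv() (hypothetical: _demo_gen_tsv and
# parse_product are not part of the original code, and it assumes the
# parser callback returns one sequence of column values per page, which
# is what the TSV output format suggests).
def _demo_gen_tsv(urls, site, parse_product):
    for row in gen_tsv(urls, site, parse_product):
        # Each yielded item becomes one tab-separated output line.
        print('\t'.join(map(str, row)))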
def _build(aids):
    """Yield pairs of the formatted actress name and her list-page URL"""
    for aid in aids:
        resp, he = libssw.open_url(ACTINFOPAGE.format(aid),
                                   'utf-8',
                                   set_cookie='age_check_done=1')
        yield '/'.join(_getnames(he)), ACTLISTPAGE.format(aid)
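
# A minimal usage sketch for _build() (the actress IDs below are
# placeholders, not real ones): collecting the yielded pairs gives a
# name -> list-page-URL lookup table.
def _demo_build():
    actresses = dict(_build(('12345', '67890')))
    for name, listurl in actresses.items():
        print(name, listurl, sep='\t')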
def hitodumagiri(he, url):
    studio = '人妻斬り'
    performer = he.find_class('name_JP_hitozuma')[0].text.strip()
    age = libssw.extr_num(
        he.xpath('//table[@summary="movie info"][1]//tr[1]/td')[1].text)[0]
    title = performer + age + '才'
    qname = '+'.join(map(libssw.quote, performer.split()))
    srchurl = ('http://www.c0930.com/search/?q={}&x=0&y=0'
               '&category_search_type=and&flag_match_type=0').format(qname)

    # Look up the release date on the search-result pages.
    release = None
    while not release:
        resp, srchp = libssw.open_url(srchurl)
        for div in srchp.find_class('unit-thumbs ori1'):
            if div[1].get('href') == url:
                release = libssw.extr_num(div[0].text)
                break
        else:
            # Not on this page: follow the "next" link on the search
            # page, or give up when there are no more result pages.
            nextbold = srchp.find_class('next bold')
            if nextbold:
                srchurl = nextbold[0].get('href')
            else:
                break

    uncensored(url, release, title, studio, [performer], '')
def get_elems(props):
    resp, he = libssw.open_url(props['url'])
    if resp.status != 200:
        emsg('W',
             'ページを取得できませんでした: '
             'url="{0[url]}", title={0[title]}, status={1}'.format(
                 props, resp.status))
        return False
    return he.xpath('//td[@class="nw"]')
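
# A minimal caller sketch for get_elems() (hypothetical props values):
# it returns False on a non-200 response, so test with "is False" to
# distinguish a failed fetch from a page with no matching cells.
def _demo_get_elems():
    props = {'url': 'http://example.com/page', 'title': 'demo'}
    elems = get_elems(props)
    if elems is False:
        return
    for td in elems:
        print(td.text_content())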
def open_wiki(*pages):
    """Open the wiki pages in a web browser"""
    for p in filter(None, pages):
        url = _up.urljoin('http://sougouwiki.com/d/', _libssw.quote(p))
        resp, he = _libssw.open_url(url)
        if resp.status == 200:
            dest = _libssw._rdrparser(p, he)
            if dest != p and dest != '':
                # Reload the redirect destination.
                url = _up.urljoin('http://sougouwiki.com/d/',
                                  _libssw.quote(dest))
                resp, he = _libssw.open_url(url)
            inner = he.find_class('inner')[0]
            editurl = inner.xpath('.//a')[0].get('href')
            if editurl:
                _webbrowser.open_new_tab(editurl)
        else:
            message = p + ' : ページが見つかりませんでした'
            label7.config(text=message)
def parse(self, url):
    # lxml version -- not working yet
    self.url = url
    resp, he = libssw.open_url(self.url, set_cookie="adc=1")
    self.title = he.find_class('tag')[0].text_content()
    # self.title = he.find('h1')
    self.release = he.xpath(
        '//div[@class="detail_data"]/table[1]')[0].text_content()
    self.release = he.findall("table")
    # self.release = he.findall("table")[1].find("td")
    return self.title
def parse_contents_market(self, url, he=None):
    self.url = url
    if he is None:
        resp, he = libssw.open_url(url, None)
    self.title = he.find_class('detail')[0].find('h2').text_content()
    self.release = he.find_class('main_info_block')[0].find('dl').findall(
        'dd')[3].text_content().split("/")
    self.series = he.find_class('main_info_block')[0].find('dl').findall(
        'dd')[4].text_content()
    # release = he.find_class('main_info_block')[0].find('h2').find('dd')[5].text_content()
    self.img_s = "http:" + he.find_class(
        'analyticsLinkClick_mainThum')[0].find('img').get('src')
    self.img_l = he.find_class('analyticsLinkClick_mainThum')[0].get('href')
def searchwiki_by_url(url):
    """Generator that yields article names from the search results"""
    resp, he = libssw.open_url(
        'http://sougouwiki.com/search?keywords={}'.format(libssw.quote(url)),
        cache=False)
    searesult = he.find_class('result-box')[0].xpath('p[1]/strong')[0].tail
    if searesult.strip() == 'に該当するページは見つかりませんでした。':
        verbose('url not found on ssw')
        return None
    while True:
        for a in he.iterfind('.//h3[@class="keyword"]/a'):
            yield a.get('href'), a.text
        # If there is a next page, search it too.
        he = libssw.ssw_searchnext(he)
        if he is None:
            break
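
# A minimal usage sketch (since searchwiki_by_url() is a generator
# function, a "not found" search simply yields nothing, so callers can
# iterate unconditionally; _demo_searchwiki is hypothetical).
def _demo_searchwiki(prod_url):
    for href, title in searchwiki_by_url(prod_url):
        print(title, '=>', href)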
def check_actrpage(actr_url, listp, prod_url):
    """
    Check whether the actress page has the product info and properly
    links to the list page
    """
    # Fetch the actress page.
    resp, html = libssw.open_url(actr_url, cache=False, to_elems=False)
    if resp.status == 404:
        return False, 404, False

    # Scan each line of the actress page and take the one that contains
    # the DMM product-page URL.
    for line in filter(lambda x: prod_url in x, html.splitlines()):
        link2list = re_ssw.findall(line)
        verbose('link to listpage: ', link2list)
        return (True,
                link2list,
                listp.lower() in tuple(L.lower() for L in link2list))

    # The product info was not found.
    verbose('prod info not found')
    return False, False, False
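
# A sketch of unpacking check_actrpage()'s three-part result (found
# flag, extracted links or 404, and whether the expected list page is
# among them); _demo_check_actrpage is hypothetical, the arguments are
# whatever the caller already has.
def _demo_check_actrpage(actr_url, listp, prod_url):
    found, links, linked = check_actrpage(actr_url, listp, prod_url)
    if links == 404:
        print('actress page missing')
    elif not found:
        print('product info not on the actress page')
    else:
        print('list page linked' if linked else 'list page NOT linked')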
def ipondo(he, url):
    studio = '一本道'
    title = he.xpath('//h1')[0].getnext().text_content().strip()
    performers = he.xpath('//h1/a')[0].text.strip().split()

    # Get the release date from the search-result pages.
    # Build the search query URL.
    qlist = performers[:]
    qlist.append(title)
    verbose('qlist: ', qlist)
    searchurl = 'http://www.1pondo.tv/list.php?q={}&op=and'.format(
        '+'.join(libssw.quote(s) for s in qlist))
    verbose('searchurl: ', searchurl)

    release = None
    while not release:
        r, e = libssw.open_url(searchurl)
        for div in e.iterfind('.//div[@class="list_container"]/div'):
            a = div.xpath('a')[0]
            verbose('a: ', a.get('href'))
            if a.get('href') == url:
                release = libssw.extr_num(div.xpath('p')[0].text_content())
                break
        else:
            # Not found on this page: move on to the next one.
            for pagin in e.iterfind(
                    './/div[@class="listblock"]/p[@align="right"]/a'):
                if pagin.text.strip() == '次へ':
                    searchurl = 'http://www.1pondo.tv{}'.format(
                        pagin.get('href'))
                    break
            else:
                # Not found anywhere: fall back to a dummy date tuple.
                emsg('W', '配信開始日を取得できませんでした。')
                release = ('0000', '00', '00')

    uncensored(url, release, title, studio, performers, '')
def main(props=_libssw.Summary(), p_args=_argparse.Namespace, dmmparser=None):
    # When called as a module, ignore the inherited command-line arguments.
    argv = [props.url] if __name__ != '__main__' else _sys.argv[1:]
    args = _get_args(argv, p_args)

    # Product information
    summ = _libssw.Summary()

    if __name__ == '__main__':
        _verbose('args: ', args)
        if not args.url:
            # No URL given: read from standard input.
            _verbose('Input from stdin...')
            data = _sys.stdin.readline().rstrip('\n')
            if not data:
                _emsg('E', 'URLを指定してください。')
            for key, data in zip(('url', 'title', 'pid', 'actress', 'number',
                                  'director', 'director', 'note'),
                                 data.split('\t')):
                if key == 'url':
                    summ[key] = data.split('?')[0]
                elif key == 'actress':
                    summ[key] = list(_libssw.parse_names(data))
                elif key == 'number':
                    summ[key] = int(data) if data else 0
                elif key == 'director':
                    summ[key] = _libssw.re_delim.split(data)
                elif key == 'note':
                    summ[key].append(data)
                else:
                    summ[key] = data
            _verbose('summ from stdin: ', summ.items())
        for attr in ('url', 'number', 'pid', 'subtitle'):
            if not summ[attr]:
                summ[attr] = getattr(args, attr)
        if not summ['actress'] and args.actress:
            actiter = _chain.from_iterable(
                map(_libssw.re_delim.split, args.actress))
            summ['actress'] = list(_libssw.parse_names(actiter))
    else:
        _verbose('props: ', props.items())
        _verbose('p_args: ', vars(p_args))
        summ.update(props)

    summ['link_label'] = getattr(args, 'label')
    summ['link_series'] = getattr(args, 'series')

    retrieval = getattr(p_args, 'retrieval',
                        'series' if args.as_series else 'find')
    service = getattr(p_args, 'service', None)
    series_guide = getattr(p_args, 'series_guide', True)

    if args.actress and args.actress[0].startswith('@@'):
        # Performers given directly as wiki text
        rawpfmrs = args.actress[0][2:]
    else:
        rawpfmrs = ''

    # Resolve the service automatically when it is not specified.
    if not service:
        service = _libssw.resolve_service(summ['url'])
        _verbose('service resolved: ', service)

    if service == 'ama':
        # For amateur videos, don't output the director column.
        args.dir_col = False

    join_d = dict()
    _libssw.ret_joindata(join_d, args)
    if (args.join_tsv or args.join_wiki or args.join_html) \
            and not len(join_d):
        _emsg('E', '--join-* オプションで読み込んだデータが0件でした。')

    # Open and read the URL.
    resp, he = _libssw.open_url(summ['url'], set_cookie='age_check_done=1')

    if resp.status == 404:
        # On 404, build and return an empty entry (table format only).
        _emsg('I', 'ページが見つかりませんでした: ', summ['url'])
        if not summ['pid']:
            summ['pid'], summ['cid'] = _libssw.gen_pid(summ['url'])
        if p_args.cid_l:
            summ['url'] = ''
        else:
            if not summ['subtitle']:
                summ['subtitle'] = summ['title']
            summ['image_sm'], summ['image_lg'] = _build_image_url(
                service, summ['cid'])
        wktxt_t = _format_wikitext_t(summ, '', '/'.join(summ['director']),
                                     args.dir_col,
                                     _build_addcols(args.add_column, summ),
                                     retrieval)
        _verbose('wktxt_t: ', wktxt_t)
        return False, resp.status, _ReturnVal(summ['release'],
                                              summ['pid'],
                                              summ['title'],
                                              summ['title_dmm'],
                                              summ['url'],
                                              summ['time'],
                                              summ('maker', 'maker_id'),
                                              summ('label', 'label_id'),
                                              summ('series', 'series_id'),
                                              summ['actress'],
                                              summ['link_label'],
                                              summ['link_series'],
                                              wktxt_a='',
                                              wktxt_t=wktxt_t)
    elif resp.status != 200:
        return False, resp.status, ('HTTP status', resp.status)

    # Fix broken markup
    # html = _libssw.sub(sub_href, html)

    # Parse the HTML.
    if not dmmparser:
        dmmparser = _libssw.DMMParser(autostrip=args.autostrip,
                                      longtitle=args.longtitle,
                                      check_rental=args.check_rental,
                                      check_rltd=args.check_rltd)

    try:
        summ.update(dmmparser(he, service, summ, ignore_pfmrs=rawpfmrs))
    except _libssw.OmitTitleException as e:
        # Subject to exclusion: abort.
        return False, 'Omitted', (e.key, e.word)

    _verbose('summ: ', summ.items())

    if dmmparser.data_replaced:
        service = dmmparser.data_replaced

    # Complement from the join data when available.
    if summ['url'] in join_d:
        summ.merge(join_d[summ['url']])

    if args.pid:
        summ['pid'] = args.pid

    # Auto-generate the image links when none are set yet.
    if not summ['image_lg']:
        summ['image_sm'], summ['image_lg'] = _build_image_url(
            service, summ['cid'])
        _verbose('image_sm: ', summ['image_sm'])
        _verbose('image_lg: ', summ['image_lg'])

    #
    # Adjusting the title
    #
    # Handle takedown requests.
    for dl in _libssw.HIDE_NAMES_V:
        summ['title'] = summ['title'].replace(dl, '').strip()

    on_dmm = summ['title']

    # Replace strings that collide with wiki syntax.
    modified = _libssw.trans_wikisyntax(on_dmm)
    if _AUTOMODIFY:
        # Replace the substitute strings for ♥.
        modified = _libssw.sub(_sub_heart, modified)

    summ['title'] = modified
    if not summ['title_dmm'] and modified != on_dmm:
        summ['title_dmm'] = on_dmm
    _verbose('summ[title]: ', summ['title'])
    _verbose('summ[title_dmm]: ', summ['title_dmm'])

    # Set the link info for the label/series list pages.
    if (not args.hide_list) and args.check_listpage:
        _resolve_pagelink(summ, args)

    _verbose('summ[link_label]: ', summ['link_label'])
    _verbose('summ[link_series]: ', summ['link_series'])

    if args.note:
        summ['note'] = list(_expansion(args.note, summ)) + summ['note']
    _verbose('note: ', summ['note'])

    add_column = _build_addcols(args.add_column, summ)
    _verbose('add column: ', add_column)

    # Build the performers string.
    pfmrslk = ()
    if rawpfmrs:
        # Wiki text
        pfmrslk = _libssw.re_linkpare.findall(rawpfmrs)
        pfmrsstr, pnum = rawpfmrs, len(pfmrslk)
    elif len(summ['actress']) < 2 and not summ['number'] and args.table == 0:
        # Skip when building only the actress page and there is just
        # one performer.
        pfmrsstr, pnum = '', 0
    else:
        pfmrsstr, pnum = _libssw.stringize_performers(summ['actress'],
                                                      summ['number'],
                                                      args.follow_rdr)

    # Build the director string.
    dirstr = '/'.join(summ['director'])

    # Build the subtitle for the table format.
    if retrieval == 'series':
        # If the series name is on list_page and the title starts with
        # the same string, drop it. If list_page has no value, use the
        # title as-is.
        if not summ['subtitle']:
            summ['subtitle'] = _re.sub(
                r'^{}[、。!?・…♥]*'.format(summ['series']),
                '',
                summ['title'],
                flags=_re.I).strip()
    elif not summ['subtitle']:
        # Use the title as-is for the subtitle (table format).
        summ['subtitle'] = summ['title']
    _verbose('subtitle: ', summ['subtitle'])

    # Check for information not yet retrieved.
    if _VERBOSE:
        _check_missings(summ)

    # Build the wiki text.
    wikitext_a = _format_wikitext_a(summ, pnum, pfmrsstr,
                                    service) if args.table != 1 else ()
    wikitext_t = _format_wikitext_t(summ, pfmrsstr, dirstr, args.dir_col,
                                    add_column,
                                    retrieval) if args.table else ''

    if __name__ != '__main__':
        # Called as a module: return a tuple.
        return True, summ['url'], _ReturnVal(summ['release'],
                                             summ['pid'],
                                             summ['title'],
                                             summ['title_dmm'],
                                             summ['url'],
                                             summ['time'],
                                             summ('maker', 'maker_id'),
                                             summ('label', 'label_id'),
                                             summ('series', 'series_id'),
                                             summ['actress'],
                                             summ['link_label'],
                                             summ['link_series'],
                                             wikitext_a,
                                             wikitext_t)
    else:
        # Write the output.
        output = ['']
        if wikitext_a:
            output.append(wikitext_a)
        if wikitext_t:
            output.append(wikitext_t)
        print(*output, sep='\n')

        if args.copy:
            _verbose('copy 2 clipboard')
            _libssw.copy2clipboard(''.join(output))

        if args.browser:
            # Open the wiki pages in a browser.
            if args.table != 1:
                pages = pfmrslk or summ['actress']
                for a in pages:
                    _libssw.open_ssw(a[1] or a[0])
            if args.table:
                _libssw.open_ssw(summ['link_label'])
                _libssw.open_ssw(summ['link_series'])
def _build(aids):
    """Yield pairs of the formatted actress name and her list-page URL"""
    for aid in aids:
        resp, he = libssw.open_url(ACTINFOPAGE.format(aid))
        yield '/'.join(_getnames(he)), ACTLISTPAGE.format(aid)
def main():
    args = get_args()

    # urls = ('http://' + u if not u.startswith('http://') else u
    #         for u in args.url)
    urls = ('https://' + u if not re.match('^https?://.+', u) else u
            for u in args.url)

    for url in urls:
        netloc = urlparse(url).netloc

        if netloc == 'www.mgstage.com':
            f = mgs()
            f.parse_by_bs4(url)
            # f.parse(url)
            f.print_cencered()
        else:
            resp, he = libssw.open_url(
                url,
                charset='euc_jisx0213'
                if netloc.endswith('h-paradise.net') else None)
            if netloc in ('www.aventertainments.com',
                          'www.avfantasy.com',
                          'www.mediafreakcity.com'):
                japorn(he, url)
            elif netloc == 'www.1pondo.tv':
                ipondo(he, url)
            elif netloc in ('www.caribbeancom.com', 'www.caribbeancompr.com'):
                caribbean(he, netloc, url)
            elif netloc == 'www.heyzo.com':
                heyzo(he, url)
            elif netloc == 'www.heydouga.com':
                heydouga(he, url)
            elif netloc == 'my.tokyo-hot.com':
                tokyohot(he, url)
            elif netloc == 'www.pacopacomama.com':
                pacopacomama(he, url)
            elif netloc == 'www.10musume.com':
                tenmusume(he, url)
            elif netloc == 'www.jukujo-club.com':
                jukujoclub(he, url)
            elif netloc.endswith('h-paradise.net'):
                h_paradise(he, url)
            elif netloc == 'www.c0930.com':
                hitodumagiri(he, url)
            elif netloc == 'adult.contents.fc2.com':
                f = fc2()
                fc2_id = f.url2id(url)
                f.fc2_id = fc2_id
                f.parse_contents_market(url, he=he)
                f.print_cencered()
            else:
                emsg('E', '未知のサイトです。')
def select_allhiragana(ids, today, path, selfcheck):
    no_omits = libssw.gen_no_omits((0, 4))
    dmmparser = libssw.DMMParser(no_omits=no_omits,
                                 deeper=False,
                                 pass_bd=True,
                                 quiet=True)

    conn = sqlite3.connect(path)
    cur = conn.cursor()

    if ids:
        sql = 'select id,current,last_release from main ' \
              'where id in({}) '.format(','.join('?' * len(ids)))
        ph = list(ids)
    else:
        sql = 'select id,current,last_release from main ' \
              'where last_release is not null and ' \
              'retired is null and deleted is null '
        ph = []
        if not selfcheck:
            ayearago = str(today - timedelta(days=365))
            sql += 'and last_release > ?'
            ph.append(ayearago)

    print('sql:', sql, ', ph:', ph)

    for aid, name, last_release in filter(
            lambda n: not libssw.re_neghirag.search(n[1]) and len(n[1]) < 5,
            cur.execute(sql, ph)):
        verbose(aid, name)
        print(aid, name + ': ', end='')

        url = 'http://actress.dmm.co.jp/-/detail/=/actress_id={}/'.format(aid)
        verbose('url: ', url)

        if not selfcheck:
            # Not self-checking: just yield the name.
            print('({})'.format(last_release))
            yield name
            continue

        resp, he = libssw.open_url(url)
        while he is not None:
            info = he.find('.//td[@class="info_works1"]')
            if info is None:
                print('negative')
                break

            for tr in info.getparent().getparent()[1:]:
                title = tr.find('td/a').text
                verbose('title: ', title)

                if tr[4].text == '---' and tr[6].text == '---':
                    verbose('Not DVD and Rental')
                    continue

                sale = tr[4].find('a')
                rental = tr[6].find('a')
                prod_url = sale.get('href') if sale is not None \
                    else rental.get('href')
                verbose('prod url: ', prod_url)

                cid = libssw.get_id(prod_url, cid=True)[0]
                if libssw.check_omit(title, cid, no_omits=no_omits):
                    continue

                b, status, values = dmm2ssw.main(
                    props=libssw.Summary(url=prod_url),
                    p_args=argparse.Namespace(fastest=True, hide_list=True),
                    dmmparser=dmmparser)
                if status in {'Omitted', 404}:
                    verbose('Omitted: status=', status, ', values=', values)
                    continue
                verbose('return from dmm2ssw: ', values)

                last_web = tr[7].text.strip()
                if last_release != last_web:
                    print('last_rel is different (db:{} <=> web:{}), '.format(
                        last_release, last_web), end='')

                datelast = date(*map(int, last_web.split('-')))
                if (today - datelast).days < 366:
                    yield name
                    print('\033[1;33mpositive ({})\033[0m'.format(last_web))
                else:
                    print('negative ({})'.format(last_web))
                he = None
                break
            else:
                # Follow the next page if there is one.
                mu = he.get_element_by_id("mu").xpath('table[last()]//a')
                if len(mu) and mu[-1].text == '次へ':
                    resp, he = libssw.open_url(
                        BASEURL_ACT + mu[-1].get('href'))
                else:
                    print('negative')
                    break

    conn.close()
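
# A minimal driver sketch (assumptions: _demo_select_allhiragana and the
# 'actresses.db' path are placeholders, and an empty id sequence means
# "scan every actress in the database"):
def _demo_select_allhiragana():
    for name in select_allhiragana((), date.today(), 'actresses.db', False):
        print(name)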