# -*- coding: utf-8 -*-
# Shared imports for the functions below. `conn` is a module-level sqlite
# handle in the original; the 'av.db' path here is an assumption.
import os
import re
import sqlite3
import requests
from time import sleep
from BeautifulSoup import BeautifulSoup as bf

conn = sqlite3.connect('av.db')


def javbooks(avlist):
    """Look up each video id on javbooks.com, open its magnet link via the
    system handler, and record the id so it is not fetched twice."""
    s = requests.session()
    j = 0  # count of ids that could not be resolved
    f = open('url.txt', 'w')
    for i in avlist:
        a = conn.execute("select * from name where id=?", (i, ))
        if a.fetchone() is None:  # only process ids we have not seen before
            url = 'http://javbooks.com/serch_censored/' + i + '/serialall_1.htm'
            r = s.get(url)
            get_bt_url = bf(r.text)
            try:
                bt_url = get_bt_url.find('div', {'class': 'Po_topic_title'}).find('a')['href']
                r = s.get(bt_url)
                result = bf(r.text)
                #f.write(result.find('div', {'class': 'dht_dl_title_content'}).find('a')['href'] + '\n')
                conn.execute("insert into name (id) values(?)", (i, ))
                conn.commit()  # persist the insert
                os.system("start " + result.find('div', {'class': 'dht_dl_title_content'}).find('a')['href'])
                print "------------------------"
                sleep(5)
            except:
                j = j + 1
                print i
    print j
    f.close()
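
# A minimal usage sketch for javbooks(), not part of the original script.
# Assumptions: the `name` table has a single text `id` column (inferred from
# the queries above), and the ids below are made-up placeholders.
def demo_javbooks():
    conn.execute("create table if not exists name (id text)")
    javbooks(['abp-123', 'ipz-456'])  # hypothetical video ids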
def print_post(blog, text):
    #debug = blog.logger.debug

    def unescape(text):
        chars = {"&quot;": "\""}
        for c, r in chars.iteritems():
            text = text.replace(c, r)
        return text

    from BeautifulSoup import BeautifulSoup as bf
    from re import findall
    from os import popen3

    if findall(r"% [^\n]*mdown", text):
        # The text must contain a line with an mdown-format directive giving
        # that name. A special marker format declares the language of the
        # code that follows; every code block must be preceded by such a
        # description, otherwise it would wreak havoc on the tags, and we
        # do not want that.
        code_infos = findall(r"% \[code\] lang *[:=] *(\w+) *", text)
        f_in, f_out, f_err = popen3("mdown -f xhtml -B tdql -b xml")
        f_in.write(text.encode("utf-8", "ignore"))
        f_in.close()
        out, err = f_out.read(), f_err.read()
        f_out.close()
        f_err.close()
        if err:
            blog.logger.warning(u"Mdown : %s" % err)
        soup = bf(unescape(out))
        # Pair each declared language with the <code> tags in document order.
        for elem, lang in zip(soup.findAll("code"), code_infos):
            elem.parent.replaceWith(elem)
            elem["lang"] = lang
        return unicode(soup.prettify(), "utf-8")
    else:
        return text
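
# Usage sketch for the mdown variant of print_post() above (note that a
# second print_post() later in this file shadows this name at module scope,
# so a live call would not reach this definition). A "% ... mdown" line
# enables processing, and each "% [code] lang: <name>" marker supplies the
# language for the matching <code> tag via the zip() above. `FakeBlog` is a
# hypothetical stand-in for the real blog object:
#
#   import logging
#   class FakeBlog(object):
#       logger = logging.getLogger("blog")
#   text = u"% format: mdown\n% [code] lang: python\n\n    print 'hi'\n"
#   html = print_post(FakeBlog(), text)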
def get_avlist():
    """Scrape the DMM ranking page and normalise each cid into a
    '<label>-<number>' video id."""
    avlist = []
    # cid variants: plain, digit-prefixed, '<letter>_<digits>'-prefixed,
    # and a catch-all used only to log unmatched cids.
    pat = '[\w+\W+]+cid=([a-z]+)00([0-9]+)'
    pat1 = '[\w+\W+]+cid=[0-9]+([a-z]+)00([0-9]+)'
    pat2 = '[\w+\W+]+cid=[a-z]_[0-9]+([a-zA-Z]+)00([0-9]+)'
    pat3 = '[\w+\W+]+cid=([\w+])/'
    s = requests.session()
    r = s.get('http://www.dmm.co.jp/digital/videoa/-/list/=/sort=ranking/')
    alllist = bf(r.text)
    result = alllist.find('ul', {'id': 'list'}).findAll('li')
    for i in result:
        cid = i.find('p', {'class': 'tmb'}).find('a')['href']
        if re.match(pat, cid) is not None:
            video_id = re.match(pat, cid).group(1) + '-' + re.match(pat, cid).group(2)
        elif re.match(pat1, cid) is not None:
            video_id = re.match(pat1, cid).group(1) + '-' + re.match(pat1, cid).group(2)
        elif re.match(pat2, cid) is not None:
            video_id = re.match(pat2, cid).group(1) + '-' + re.match(pat2, cid).group(2)
        else:
            # Unrecognised cid: log what we can and skip it, instead of
            # appending a stale (or unbound) video_id.
            m = re.match(pat3, cid)
            if m is not None:
                print m.group(1)
            #print 'error'
            continue
        avlist.append(video_id)
    return avlist
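
# A quick, self-contained sanity check of the cid patterns above, using a
# made-up href shaped like the DMM detail links (the exact URL layout is an
# assumption):
def demo_cid_patterns():
    pat = '[\w+\W+]+cid=([a-z]+)00([0-9]+)'
    cid = '/digital/videoa/-/detail/=/cid=abp00123/'
    m = re.match(pat, cid)
    print m.group(1) + '-' + m.group(2)  # prints: abp-123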
def sukebei(avlist):
    """Search sukebei.nyaa.se for each video id and print the first trusted
    torrent link; j counts the ids with no trusted result."""
    s = requests.session()
    j = 0
    for i in avlist:
        url = 'https://sukebei.nyaa.se/?page=search&cats=8_0&filter=0&term=' + i
        r = s.get(url)
        get_bt_url = bf(r.text)
        try:
            print get_bt_url.find('tr', {'class': 'tlistrow trusted'}).find('td', {'class': 'tlistname'}).find('a')['href']
        except:
            j = j + 1
            print i
    print j
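
# Usage sketch: chain the scraper and the search. sukebei() only prints
# links (it writes nothing and opens nothing), so it is the safer of the two
# lookups to try first:
def demo_sukebei():
    sukebei(get_avlist())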
def print_post(blog, text):
    from pygments import highlight
    from pygments.lexers import guess_lexer, get_lexer_by_name
    from pygments.formatters import HtmlFormatter
    from pygments.util import ClassNotFound
    from BeautifulSoup import BeautifulSoup as bf

    def debug(text):
        import codecs
        with codecs.open("debug.log", "a", "utf-8") as f:
            f.write(str(text) + "\n")

    def unescape(text):
        return text.replace("&amp;", "&")
        #replace("&lt;", "<"). \
        #replace("&gt;", ">"). \
        #replace("&quot;", "\""). \
        #replace("&#39;", "'")

    debug = blog.logger.debug  # rebind debug to the blog's logger

    soup = bf(text)
    for elem in soup.findAll("code"):
        try:
            lang = elem["lang"]
        except KeyError:
            lang = "text"
        new_elem = elem.findChild("pre") if elem.findChild("pre") else elem
        level = 0
        #while not (isinstance(new_elem, unicode) or level > 3):
        #    level += 1
        #    new_elem = new_elem.next
        content = unicode(new_elem.renderContents(), "utf-8")
        try:
            lexer = get_lexer_by_name(lang)
        except ClassNotFound:
            try:
                lexer = guess_lexer(content.lstrip())
            except ClassNotFound:
                lexer = get_lexer_by_name("text")
        formatter = HtmlFormatter(linenos="inline")
        elem.next.extract()
        new_content = highlight(content, lexer, formatter)
        elem.replaceWith(unescape(new_content))
    return unicode(soup.prettify(), "utf-8")
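
# Usage sketch for the Pygments variant of print_post() above, which is the
# definition this name resolves to at module scope. It expects HTML where a
# block looks like <code lang="python"><pre>...</pre></code>; a missing or
# unknown lang attribute falls back to guess_lexer(). `FakeBlog` is a
# hypothetical stand-in supplying the .logger the function rebinds debug to.
def demo_print_post_pygments():
    import logging

    class FakeBlog(object):
        logger = logging.getLogger("blog")

    html = u'<code lang="python"><pre>print "hi"</pre></code>'
    print print_post(FakeBlog(), html)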