Example #1
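# Setup hook for an episode scraper: loads the protobuf database named in options.database,
# computes yesterday's date as a struct_time, and creates an HTTP cache with a browser
# user agent; everything is returned and also pushed into the module's globals.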
def setup(options):
	from episodes_pb2 import All
	from datetime import date, timedelta
	from time import time
	from urlgrab import Cache

	db = All()
	db.ParseFromString(open(options.database,"rb").read())
	yesterday = date.fromtimestamp(time())-timedelta(days=1)
	yesterday = yesterday.timetuple()

	cache = Cache(debug=options.debug)
	cache.user_agent = "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-GB; rv:1.8.1.3) Gecko/20070309 Firefox/2.0.0.3"

	items = {"yesterday":yesterday, "cache":cache, "db": db}
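	# Publish each item as a module-level global in addition to returning the dict.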
	for x in items:
		globals()[x] = items[x]
	return items
Example #2
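# Scrapes a story from literotica.com: fetches the page through the cache, dumps the raw
# HTML, pulls the title, detects "Ch."/"Pt." chapter numbering, and starts a table of
# contents with tocStart() from the common module.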
from common import *
from sys import argv
from urlgrab import Cache
from re import compile, DOTALL, MULTILINE

cache = Cache()
url = argv[1]

titlePattern = compile("<h1>([^<]+)</h1>")
contentPattern = compile("<div class=\"b-story-body-x x-r15\">(.+?)</div><div class=\"b-story-stats-block\">" , DOTALL|MULTILINE)
nextPattern = compile("\"([^\"]+)\">Next</a>")

chapterPattern = compile("(.*?) (?:Ch.|Pt.) (\d+)")
memberPattern = compile("<a href=\"(https://www.literotica.com/stories/memberpage.php\?uid=\d+&amp;page=submissions)\">([^<]+)</a>")
chapterLinkPattern = compile("href=\"(https://www.literotica.com/s/[^\"]+)\">([^<]+)</a>")

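# Fetch the story page via the urlgrab cache (the exact meaning of max_age=-1 depends on
# that library) and dump the raw HTML for debugging.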
page = cache.get(url, max_age = -1)
data = page.read()
open("dump", "wb").write(data.encode("utf-8"))

title = titlePattern.findall(data)
title = title[0]

chapter = chapterPattern.match(title)
if chapter is not None:
	title = chapter.groups()[0]
	currentChapter = 1

print title

toc = tocStart(title)
Example #3
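# Fetches the chapter index of an archiveofourown.org work: extracts the work id from the
# URL, loads the /works/<id>/navigate page, scrapes title and author, and builds a
# chapter-number -> (chapter URL, chapter title) map from the index links.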
from sys import argv
from urlgrab import Cache
from codecs import open
import re
from common import *
from urlparse import urljoin

cache = Cache()
url = argv[1]

id = re.search("/works/(\d+)", url)
id = id.groups()[0]

navigate = "http://archiveofourown.org/works/%s/navigate"%id
print navigate

data = cache.get(navigate).read()
data = data.decode("utf-8")
info = re.search("<h2 class=\"heading\">Chapter Index for <a href=\"/works/\d+\">([^<]+)</a> by <a href=\"[^\"]+\" rel=\"author\">([^<]+)</a></h2>", data)
(title, author) = info.groups()

titlePattern = re.compile("<h2 class=\"title heading\">\s+(.*?)\s+</h2>")
summary = re.compile("<div[^>]+?class=\"summary module\"[^>]*?>(.+?)</div>", re.DOTALL|re.MULTILINE)
notes = re.compile("<div.+?class=\"notes module\"[^>]*>(.+?)</div>", re.DOTALL|re.MULTILINE)
mainContent = re.compile("<h3 class=\"landmark heading\" id=\"work\">Chapter Text</h3>(.*?)<!--/main-->", re.DOTALL|re.MULTILINE)
volumePattern = re.compile("<li><a href=\"(/works/\d+/chapters/\d+)\">(\d+). ([^<]+)</a>")

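# Gather the chapter links from the navigation page and key them by chapter number.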
volumes = sorted(volumePattern.findall(data))

print volumes
volumes = dict([(int(x[1]), (x[0],x[2])) for x in volumes])
Example #4
# -*- coding: utf-8 -*-
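# Start of a script that turns a text-format protobuf list of series (series.txt) into
# Kindle books: it sets up a cached HTTP client, compiles tag-stripping regexes, and maps
# typographic characters to replacements the Kindle renders more reliably.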
from urlgrab import Cache
from google.protobuf import text_format
from blog_pb2 import All
from re import compile, DOTALL, MULTILINE
from os.path import exists, join
from codecs import open
from urlparse import urljoin
from optparse import OptionParser
from common import generatePage, tocStart, tocEnd, makeMobi

c = Cache()
c.user_agent = "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-GB; rv:1.8.1.3) Gecko/20070309 Firefox/2.0.0.3"

db = All()
text_format.Merge(open("series.txt","rb","utf-8").read(),db)
stripTags = compile("<[^>]+>")
stripAnchorTags = compile("(?:<a[^>]+>)|(?:</a>)")

# Kindle doesn't like various characters, so lets rewrite some of them...
wrong = {
		u"“": u"\"",
		u"’": u"'",
		u"”": u"\"",
		u"‘": u"'",
		u"—": u" - ",
		u"…": u"-",
		u"": u"",
		u'“': u"\"",
		u'”':u"\"",
		u'–':u"-",
Example #5
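# Crawls The Register's BOFH column archive: starting from the index page, it follows the
# "Earlier Stories" link backwards to collect every listing page, and compiles the regexes
# used to pull the title, standfirst and episode body out of each article.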
from common import *
from re import compile, DOTALL, MULTILINE
from urlgrab import Cache
from urlparse import urljoin

linkPattern = compile("<h3><a href=\"(/[^\"]+)\">(.+?)</a></h3>")
earlierPattern = compile("<a href='([^\']+)'>.+?Earlier Stories.+?</a>", DOTALL | MULTILINE)
titlePattern = compile("<h2>(.+?)</h2>")
subtitlePattern = compile("<p class=\"standfirst\">(.+?)</p>")
contentPattern = compile("<strong class=\"trailer\">.+?</p>(.+?)(?:(?:<p>(?:(?:<i>)|(?:<small>)|(?:<font size=\"-2\">)|(?:<br>\n))?BOFH .+? Simon Travaglia)|(?:<ul class=\"noindent\">)|(?:<ul>.+?<li><a href=\"http://www.theregister.co.uk/content/30/index.html\">BOFH: The whole shebang</a></li>)|(?:</form>))", DOTALL| MULTILINE)
adPattern = compile("(<div id=ad-mu1-spot>.+?</div>)", MULTILINE | DOTALL)
episodePattern = compile("<strong class=\"trailer\">Episode \d+")

url = "http://www.theregister.co.uk/data_centre/bofh/"
pages = [url]
cache = Cache()

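# Walk backwards through the archive, remembering each index page, until a page with no
# story links is reached.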
while True:
	print url
	data = cache.get(url).read()
	links = linkPattern.findall(data)

	if links == []:
		break

	if url not in pages:
		pages.insert(0, url)

	earlier = earlierPattern.findall(data)
	url = urljoin(url, earlier[0])

skipTitles = ["Salmon Days is Go!"]
Example #6
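# Prints a text-format "series" stanza for a fanfiction.net story: it fetches the given
# URL, scrapes the title, author and story id, and fills in hardcoded scraping patterns
# for the mobile site (m.fanfiction.net).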
from sys import argv
from urlgrab import Cache
from codecs import open
import re

cache = Cache()
url = argv[1]
data = cache.get(url).read()
open("dump", "wb", "utf-8").write(data)

title = re.search("<title>(.+?) Chapter \d+", data)
title = title.groups()
author = re.search("By:</span> <a[^>]+?href='/u/\d+/[^']+'>([^<]+)</a>", data)
author = author.groups()[0]
id = re.search("/s/(\d+)", url)
id = id.groups()[0]

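# Emit the stanza: name is the title with spaces removed, description is the full title.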
print """series {
	name: "%s"
	description: "%s"
	author: "%s"
	startPage: "http://m.fanfiction.net/s/%s/1"
	titlePattern: "<img src='/[^']+/balloon.png' class='mt icons'>[\d,]+</a></span>(.+?)<br>"
	contentPattern: "id='storycontent' >(.+?)</div></div>.*?<hr size=1"
	nextPattern: "<a href='(/s/\d+/\d+/)'>Next &#187;</a>"
}"""%(title[0].replace(" ",""), title[0], author, id)

Example #7
#!/usr/bin/python
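# Scrapes the Escapist's Zero Punctuation video listing: walks the first ten index pages,
# collects links to individual episode pages, and derives a short identifier from the slug
# portion of each episode URL.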

from urlgrab import Cache
from BeautifulSoup import MinimalSoup as BeautifulSoup
from re import compile
from os.path import exists, getsize, dirname, join
from urllib import urlretrieve, urlencode, quote
from sys import argv
import demjson
import zlib

folder = dirname(argv[0])

cache = Cache(debug=False)

pages = []

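# Fetch up to ten listing pages (cached for two hours) and scan each for episode links.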
for index in range(1,11):
        index = cache.get("http://www.escapistmagazine.com/videos/view/zero-punctuation?page=%d"%index, max_age=60*60*2).read()
        index = index.replace("''>","'>")
        index = BeautifulSoup(index)

        for link in index.findAll("a"):
                if not link.has_key("href"):
                        continue
                if link["href"].find("http://www.escapistmagazine.com/videos/view/zero-punctuation/")!=-1:
                        short_href = link["href"]
                        slash = short_href.rfind("/")
                        if short_href[slash:].find("-")!=-1:
                                short_href = short_href[slash+1:slash+short_href[slash:].find("-")]
                        else: