import logging
import dateutil.parser

from lib import db
from lib import web

ljname = "polustanok"  # example journal name; set to the LJ account to mirror

# logging
logging.basicConfig (format="%(asctime)s: %(message)s", level=logging.INFO)

db_path = "/mnt/heap/misc/ljblogs/%s" % ljname
#db_path = "db/%s" % ljname
max_page = 20
blog_url = "http://%s.livejournal.com" % ljname
blog_db = db.BlogDB (db_path)
blog_db.load ()

# parse all pages; each index page lists 10 posts, reached via ?skip=
for idx in range (0, max_page+1):
    data = web.wget ("http://%s.livejournal.com/?skip=%d" % (ljname, idx*10))
    idx_parser = web.LJIndexParser (data)
    if len (idx_parser.links) == 0:
        break
    for url in idx_parser.links:
        if url in blog_db.meta:
            logging.info ("Post %s is already in DB, skip" % url)
        else:
            logging.info ("Process post %s" % url)
            data = web.wget (url)
            a_parser = web.ArticleParser (data.decode ("utf-8"), blog_url)
            date = dateutil.parser.parse (a_parser.date)
            images = [img.encode ('utf-8') for img in a_parser.images.keys ()]
            me = db.MetaEntry (date, a_parser.title.encode ('utf-8'), url, images)
#db_path = "db/%s" % ljname blog_url = "http://%s.livejournal.com" % ljname blog_db = db.BlogDB (db_path) blog_db.load () blog = index.LJIndex (ljname) # parse all pages for str_date, url in blog.all (): if url in blog_db.meta: logging.info ("Post %s is already in DB, skip" % url) else: logging.info ("Process post %s" % url) data = web.wget (url) a_parser = web.ArticleParser (data.decode ("utf-8"), blog_url) date = dateutil.parser.parse (str_date) images = [img.encode ('utf-8') for img in a_parser.images.keys ()] me = db.MetaEntry (date, a_parser.title.encode ('utf-8'), url, images) logging.info ("Process %d images" % len (a_parser.images)) for dest, src in a_parser.images.iteritems (): try: image_data = web.wget (src.encode ('utf-8')) blog_db.add_image (dest.encode ('utf-8'), image_data) except IOError: pass # add to meta last
from lib import index
from lib import web
import dateutil.parser

# quick manual test of the index and article parsers
blog = index.LJIndex ("polustanok")
#for url in blog.all ():
#    print url
#for date, url in blog._posts (2011, 11):
#    print date, url

data = web.wget ("http://pesen-net.livejournal.com/70540.html")
a_parser = web.ArticleParser (data.decode ("utf-8"), "http://pesen-net.livejournal.com")
print a_parser.title
print a_parser.images
print a_parser.text
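# All of these scripts fetch pages through lib.web.wget.  A minimal sketch of
# such a helper, assuming it simply returns the raw response bytes (the real
# lib/web.py may add headers, retries, or throttling):
import urllib2

def wget (url):
    # urllib2.URLError subclasses IOError, so network failures surface as
    # the IOError the image-download loops catch and skip
    return urllib2.urlopen (url).read ()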
import logging

from lib import db
from lib import web

# setup socks proxy (needs the SocksiPy module; uncomment the imports too)
#import socks
#import socket
#socks.setdefaultproxy(socks.PROXY_TYPE_SOCKS4, "127.0.0.1", 9050)
#socket.socket = socks.socksocket

# logging
logging.basicConfig (format="%(asctime)s: %(message)s", level=logging.INFO)

db_path = "/mnt/heap/misc/sgolub"
#db_path = "db"
blog_db = db.BlogDB (db_path)
blog_db.load ()

# parse all pages
for idx in range (0, 2):
    data = web.wget ("http://sgolub.ru/protograf?page=%d" % idx)
    pg_parser = web.ProtografParser (data)
    for u in pg_parser.links:
        url = "http://sgolub.ru%s" % u
        if url in blog_db.meta:
            logging.info ("Post %s is already in DB, skip" % url)
        else:
            logging.info ("Process post %s" % url)
            data = web.wget (url)
            a_parser = web.ArticleParser (data.decode ("utf-8"))
            date = web.parse_date (a_parser.date)
            images = [img.encode ('utf-8') for img in a_parser.images.keys ()]
            me = db.MetaEntry (date, a_parser.title.encode ('utf-8'), url, images)
            logging.info ("Process %d images" % len (a_parser.images))
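            # the script is truncated here; the image-download step presumably
            # mirrors the LJ script above (a sketch based on that code, not
            # the original file):
            for dest, src in a_parser.images.iteritems ():
                try:
                    image_data = web.wget (src.encode ('utf-8'))
                    blog_db.add_image (dest.encode ('utf-8'), image_data)
                except IOError:
                    pass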