Example #1
# imports assumed from the project layout shown in Example #3
import logging
import dateutil.parser
from lib import db
from lib import web

ljname = "polustanok"   # assumed; the account name is taken from Example #3

# logging
logging.basicConfig (format="%(asctime)s: %(message)s", level=logging.INFO)

db_path = "/mnt/heap/misc/ljblogs/%s" % ljname
#db_path = "db/%s" % ljname
max_page = 20

blog_url = "http://%s.livejournal.com" % ljname

blog_db = db.BlogDB (db_path)
blog_db.load ()

# parse all pages
for idx in range (0, max_page+1):
    data = web.wget ("http://%s.livejournal.com/?skip=%d" % (ljname, idx*10))
    idx_parser = web.LJIndexParser (data)

    if len (idx_parser.links) == 0:
        break

    for url in idx_parser.links:
        if url in blog_db.meta:
            logging.info ("Post %s is already in DB, skip" % url)
        else:
            logging.info ("Process post %s" % url)
            data = web.wget (url)
            a_parser = web.ArticleParser (data.decode ("utf-8"), blog_url)
            date = dateutil.parser.parse (a_parser.date)
            images = [img.encode ('utf-8') for img in a_parser.images.keys ()]
            me = db.MetaEntry (date, a_parser.title.encode ('utf-8'), url, images)
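
Every example relies on web.wget, whose implementation is not shown. A minimal
sketch of such a helper, assuming Python 2 and that it simply returns the raw
response body as a byte string:

import urllib2

def wget (url):
    # urllib2.URLError subclasses IOError, which is what Example #2
    # catches around image downloads
    return urllib2.urlopen (url).read ()
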
Example #2
# imports assumed, matching Examples #1 and #3
import logging
import dateutil.parser
from lib import db
from lib import index
from lib import web

ljname = "polustanok"   # assumed; the account name is taken from Example #3

db_path = "/mnt/heap/misc/ljblogs/%s" % ljname   # as in Example #1
#db_path = "db/%s" % ljname

blog_url = "http://%s.livejournal.com" % ljname

blog_db = db.BlogDB (db_path)
blog_db.load ()

blog = index.LJIndex (ljname)

# parse all posts from the index
for str_date, url in blog.all ():
    if url in blog_db.meta:
        logging.info ("Post %s is already in DB, skip" % url)
    else:
        logging.info ("Process post %s" % url)
        data = web.wget (url)
        a_parser = web.ArticleParser (data.decode ("utf-8"), blog_url)
        date = dateutil.parser.parse (str_date)
        images = [img.encode ('utf-8') for img in a_parser.images.keys ()]
        me = db.MetaEntry (date, a_parser.title.encode ('utf-8'), url, images)

        logging.info ("Process %d images" % len (a_parser.images))

        for dest, src in a_parser.images.iteritems ():
            try:
                image_data = web.wget (src.encode ('utf-8'))
                blog_db.add_image (dest.encode ('utf-8'), image_data)
            except IOError:
                # skip images that fail to download; the post itself is still saved
                pass

        # add to meta last
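        # hypothetical completion: the snippet breaks off here; add_meta () and
        # save () are assumed method names (only load () is shown in these examples)
        blog_db.add_meta (me)

blog_db.save ()
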
Example #3
from lib import index
from lib import web
import dateutil.parser


blog = index.LJIndex ("polustanok")

#for url in blog.all ():
#    print url

#for date, url in blog._posts (2011, 11):
#    print date, url

# quick smoke test: fetch one post and print what ArticleParser extracts
data = web.wget ("http://pesen-net.livejournal.com/70540.html")
a_parser = web.ArticleParser (data.decode ("utf-8"), "http://pesen-net.livejournal.com")
print a_parser.title
print a_parser.images
print a_parser.text
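
The LJIndex internals are not shown in these examples. Judging from the
commented _posts (2011, 11) call above and the (date, url) pairs that
blog.all () yields in Example #2, the archive walk could plausibly be composed
as follows (a hypothetical sketch; the function name and year range are
assumptions):

def walk_archive (blog, years):
    # _posts (year, month) is assumed to yield (date, url) pairs for one
    # archive month, as the commented call above suggests
    for year in years:
        for month in range (1, 13):
            for date, url in blog._posts (year, month):
                yield date, url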

Example #4
# setup socks proxy (SocksiPy; 9050 is the default Tor port)
#import socks, socket
#socks.setdefaultproxy(socks.PROXY_TYPE_SOCKS4, "127.0.0.1", 9050)
#socket.socket = socks.socksocket

# imports assumed, matching the other examples
import logging
from lib import db
from lib import web

# logging
logging.basicConfig (format="%(asctime)s: %(message)s", level=logging.INFO)

db_path = "/mnt/heap/misc/sgolub"
#db_path = "db"

blog_db = db.BlogDB (db_path)
blog_db.load ()

# parse all pages
for idx in range (0, 2):
    data = web.wget ("http://sgolub.ru/protograf?page=%d" % idx)
    pg_parser = web.ProtografParser (data)
    for u in pg_parser.links:
        url = "http://sgolub.ru%s" % u
        if url in blog_db.meta:
            logging.info ("Post %s is already in DB, skip" % url)
        else:
            logging.info ("Process post %s" % url)
            data = web.wget (url)
            a_parser = web.ArticleParser (data.decode ("utf-8"))
            date = web.parse_date (a_parser.date)

            images = [img.encode ('utf-8') for img in a_parser.images.keys ()]
            me = db.MetaEntry (date, a_parser.title.encode ('utf-8'), url, images)

            logging.info ("Process %d images" % len (a_parser.images))