Example #1
def singtao():
    # cons = sqlite3.connect(path + '/singtao.db')
    # cons.text_factory = str
    url = 'http://news.singtao.ca/vancouver/' + datetime.date.today().strftime(
        "%Y-%m-%d") + '/'
    res = httpfetch(url, 'utf-8')
    #    f=file('a.html','r')
    #    res =f.read()
    #    f.close()
    res2 = re.compile(r'>headline(.*?)\.html', re.DOTALL).findall(res)

    for topic in res2:
        web_site = '星島日報'
        if database.find(topic, web_site):
            return
        urlbase = url + 'headline' + topic + '.html'

        try:
            item_page = httpfetch(urlbase, 'utf-8', report=True)
        except Exception:
            print "Unexpected error:", sys.exc_info()[1]
            continue  # item_page is undefined if the fetch failed

        try:
            title = re.compile(r'<title>(.*?)</title>',
                               re.DOTALL).findall(item_page)[0].split('_')[0]
            content = re.compile(r'<div class="content" id="Zoom">(.*?)</div>',
                                 re.DOTALL).findall(item_page)[0]
            content = re.compile(r'<br />', re.DOTALL).sub('\n', content)
            content = re.compile(r'<.*?>', re.DOTALL).sub('', content)
            content = re.compile(r'&.*?;', re.DOTALL).sub(' ', content)
            content = re.compile(r'\n\s+', re.DOTALL).sub('\n', content)
        # content = content.strip()
        except Exception:
            print "Unexpected error:", sys.exc_info()[1]
            print urlbase
            continue  # title/content are undefined if parsing failed
        source = '星島日報'

        post_date = datetime.date.today().strftime("%Y-%m-%d")
        tries = 0
        while tries < 2:
            try:
                if not database.find(topic, web_site):
                    database.insert(topic, title, source, content, post_date,
                                    urlbase, web_site)
                else:
                    break  # already stored; 'continue' would retry forever without incrementing tries
            except Exception:
                print urlbase
                print sys.exc_info()[0]
                tries += 1
                time.sleep(10)
                continue
            break
    return
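
Note: every example in this listing calls an httpfetch helper that is never defined here. Below is a minimal sketch of what the call sites imply, assuming it is a thin urllib2 wrapper; the charset, report and needlogin parameters are inferred from usage, and real login/cookie handling is omitted.

import urllib2

def httpfetch(url, charset=None, report=False, needlogin=False):
    # hypothetical stand-in: plain GET, optional decode, optional progress print;
    # the real helper presumably installs cookie/auth handlers when needlogin=True
    if report:
        print 'fetching', url, '...'
    data = urllib2.urlopen(url, timeout=30).read()
    if charset:
        data = data.decode(charset, 'ignore')
    return data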
Example #2
def feed():
	''' read verycd feed and keep it updated every 30 min '''
	url = 'http://www.verycd.com/sto/feed'
	print 'fetching feed ...'
	feeds = httpfetch(url)
	ids = re.compile(r'/topics/(\d+)',re.DOTALL).findall(feeds)
	ids = set(ids)
	print ids
	now = time.mktime(time.gmtime())
	for id in ids:
		q.put(id)
Example #3
def feed():
    """ read verycd feed and keep update very 30 min """
    url = "http://www.verycd.com/sto/feed"
    print "fetching feed ..."
    feeds = httpfetch(url)
    ids = re.compile(r"/topics/(\d+)", re.DOTALL).findall(feeds)
    ids = set(ids)
    print ids
    now = time.mktime(time.gmtime())
    for id in ids:
        q.put(id)
Example #4
def feed():
    ''' read verycd feed and keep it updated every 30 min '''
    url = 'http://www.verycd.com/sto/feed'
    print 'fetching feed ...'
    feeds = httpfetch(url)
    ids = re.compile(r'/topics/(\d+)', re.DOTALL).findall(feeds)
    ids = set(ids)
    print ids
    now = time.mktime(time.gmtime())
    for id in ids:
        q.put(id)
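
The feed() variants above only enqueue topic ids on a shared q; the actual fetching happens elsewhere. A hypothetical wiring, mirroring the eight daemon worker threads started in Example #27's run() method (q and fetch are assumed to be module-level names, as in the examples):

import Queue
from threading import Thread

q = Queue.Queue()

def thread_fetch():
    # worker: pull topic ids off the shared queue and fetch each one
    while True:
        id = q.get()
        try:
            fetch(id)
        finally:
            q.task_done()

for _ in range(8):
    t = Thread(target=thread_fetch)
    t.setDaemon(True)
    t.start()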
Example #5
def request(pages):
	'''fetch resources in 'request' status (login required)'''
	if '-' in pages:
		(f,t)=[ int(x) for x in pages.split('-') ]
	else:
		f = t = int(pages)
	for page in range(f,t+1):
		url = 'http://www.verycd.com/orz/page%d?stat=request' % page
		idx = httpfetch(url,needlogin=True)
		ids = re.compile(r'/topics/(\d+)',re.DOTALL).findall(idx)
		print ids[0]
		for id in ids:
			q.put(id)
Example #6
def hot():
	''' read verycd hot resources and keep them updated every day '''
	url = 'http://www.verycd.com/'
	print 'fetching homepage ...'
	home = httpfetch(url)
	hotzone = re.compile(r'热门资源.*?</dl>',re.DOTALL).search(home).group()
	hot = re.compile(r'<a href="/topics/(\d+)/"[^>]*>(《.*?》)[^<]*</a>',re.DOTALL).findall(hotzone)
	html = '<h2 style="color:red">每日热门资源</h2>\n'
	for topic in hot:
		print 'fetching hot topic',topic[0],'...'
		q.put(topic[0])
		html += '&nbsp;<a target="_parent" href="/?id=%s">%s</a>&nbsp;\n' % topic
	open(path+'/static/hot.html','w').write(html)
Example #7
def request(pages):
    """fetch request res that need login"""
    if "-" in pages:
        (f, t) = [int(x) for x in pages.split("-")]
    else:
        f = t = int(pages)
    for page in range(f, t + 1):
        url = "http://www.verycd.com/orz/page%d?stat=request" % page
        idx = httpfetch(url, needlogin=True)
        ids = re.compile(r"/topics/(\d+)", re.DOTALL).findall(idx)
        print ids[0]
        for id in ids:
            q.put(id)
Example #8
def hot():
	''' read verycd hot resources and keep them updated every day '''
	url = 'http://www.verycd.com/sto/'
	print 'fetching homepage ...'
	home = httpfetch(url)
	hotzone = re.compile(r'今日热门.*?</dl>',re.DOTALL).search(home).group()
	hot = re.compile(r'<a href="/topics/(\d+)/"[^>]*>(《.*?》)[^<]*</a>',re.DOTALL).findall(hotzone)
	html = '<h2 style="color:red">每日热门资源</h2>\n'
	for topic in hot:
		print 'fetching hot topic',topic[0],'...'
		q.put(topic[0])
		html += '&nbsp;<a target="_parent" href="/?id=%s">%s</a>&nbsp;\n' % topic
	open(path+'/static/hot.html','w').write(html)
Example #9
def hot():
    """ read verycd hot res and keep update very day """
    url = "http://www.verycd.com/"
    print "fetching homepage ..."
    home = httpfetch(url)
    hotzone = re.compile(r"热门资源.*?</dl>", re.DOTALL).search(home).group()
    hot = re.compile(r'<a href="/topics/(\d+)/"[^>]*>(《.*?》)[^<]*</a>', re.DOTALL).findall(hotzone)
    html = '<h2 style="color:red">每日热门资源</h2>\n'
    for topic in hot:
        print "fetching hot topic", topic[0], "..."
        q.put(topic[0])
        html += '&nbsp;<a target="_parent" href="/?id=%s">%s</a>&nbsp;\n' % topic
    open(path + "/static/hot.html", "w").write(html)
Example #10
def fetchcmt(id,dbc=dbc,debug=False,page=1):
    print 'fetching topic',id,'...'
    urlbase = 'http://www.verycd.com/topics/'
    url = urlbase + str(id) + '/comments/page' + str(page)

    res = ''
    for _ in range(3):
        try:
            res = httpfetch(url,report=True)
            break
        except:
            continue

    if page == 1:
        pages = re.compile(r'/comments/page(\d+)').findall(res)
        if pages:
            pages = set(pages)
            for page in pages:
                if page != '1':  # findall yields strings; comparing to int 1 was always true
                    fetchcmt(id=id,dbc=dbc,page=int(page),debug=debug)

    stmts = re.compile(r'<a href="/members/[^>]*>([^<]*)</a>.*?<span class="date-time">(.*?)</span>.*?<!--Wrap-head end-->(.*?)<!--Wrap-tail begin-->',re.DOTALL).findall(res)
    # str.replace does not interpret regexes; strip tags with a compiled pattern
    striptag = re.compile(r'<.*?>', re.DOTALL)
    stmts = [[striptag.sub('', field).strip() for field in x] for x in stmts]


    for i in range(len(stmts)):
        stmts[i][2] = re.compile(r'(image-\d*)\.verycd\.com',re.I).sub(r'\1.app-base.com',stmts[i][2])
        stmts[i][2] = re.compile(r'<div[^>]*>',re.I).sub(r'',stmts[i][2])
        stmts[i][2] = re.compile(r'</div>',re.I).sub(r'',stmts[i][2])
        stmts[i][2] = re.compile(r'<!--.*-->',re.I).sub(r'',stmts[i][2])
    stmts = [ (int(id),x[0],x[2],int(time.mktime(time.strptime(x[1],'%Y/%m/%d %H:%M:%S')))-8*3600) for x in stmts ]

    if debug:
        print len(stmts)
        for stmt in stmts:
            print stmt[0],stmt[2],stmt[1]

    tries = 0
    while tries<5:
        try:
            c = dbc.cursor()
            c.executemany('replace into comment values (?,?,?,?)',stmts)
            break
        except:
            tries += 1
            time.sleep(5)
            continue
    dbc.commit()
    c.close()
    return
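
fetchcmt() above runs executemany('replace into comment values (?,?,?,?)') with (topic id, author, content, unix time) tuples. The schema is not shown anywhere in this listing; a plausible guess, with a primary key so that 'replace into' actually deduplicates instead of appending:

import sqlite3

dbc = sqlite3.connect('verycd.sqlite3.db')
dbc.text_factory = str
dbc.execute('''create table if not exists comment (
    topic_id integer,
    author   text,
    content  text,
    posted   integer,
    primary key (topic_id, author, posted)
)''')
dbc.commit()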
Example #11
def update(num=10,off=1):
    urlbase = 'http://www.verycd.com/sto/~all/page'
    for i in range(off,num+1):
        print 'fetching list',i,'...'        
        url = urlbase+str(i)
        res = httpfetch(url,needlogin=True)
        res2 = re.compile(r'"topic-list"(.*?)"pnav"',re.DOTALL).findall(res)
        if res2:
            res2 = res2[0]
        else:
            continue
        topics = re.compile(r'/topics/(\d+)',re.DOTALL).findall(res2)
        topics = set(topics)
        print topics    
        for topic in topics:
            q.put(topic)
Example #12
def update(num=10):
    urlbase = "http://www.verycd.com/sto/~all/page"
    for i in range(1, num + 1):
        print "fetching list", i, "..."
        url = urlbase + str(i)
        res = httpfetch(url)
        res2 = re.compile(r'"topic-list"(.*?)"pnav"', re.DOTALL).findall(res)
        if res2:
            res2 = res2[0]
        else:
            continue
        topics = re.compile(r"/topics/(\d+)", re.DOTALL).findall(res2)
        topics = set(topics)
        print topics
        for topic in topics:
            q.put(topic)
Example #13
def update(num=10):
    urlbase = 'http://www.verycd.com/sto/~all/page'
    for i in range(1, num + 1):
        print 'fetching list', i, '...'
        url = urlbase + str(i)
        res = httpfetch(url)
        res2 = re.compile(r'"topic-list"(.*?)"pnav"', re.DOTALL).findall(res)
        if res2:
            res2 = res2[0]
        else:
            continue
        topics = re.compile(r'/topics/(\d+)', re.DOTALL).findall(res2)
        topics = set(topics)
        print topics
        for topic in topics:
            q.put(topic)
Example #14
def search(keyword,full=True):
	'''search verycd, fetch search results'''
	url = 'http://www.verycd.com/search/folders/'+keyword
	print 'fetching search results ...'
	res = httpfetch(url)
	topics = re.compile(r'/topics/(\d+)',re.DOTALL).findall(res)
	topics = set(topics)
	links = []
	if full:
		links = re.compile(r'/search/folders/(.*?\?start=\d+)',re.DOTALL).findall(res)
		print links
	print topics
	if topics:
		for topic in topics:
			q.put(topic)
	if full and links:
		for key in links:
			search(key,full=False)
Example #15
def wenxue(num=1):
    urlbase = 'http://news.wenxuecity.com/index.php?page='
    for i in range(1, num + 1):
        print 'fetching wenxue city news on page', i, '...'
        url = urlbase + str(i)
        res = httpfetch(url)
        res2 = re.compile(r'"images/bbslogos/news\.gif"(.*?)"BBSAdd\.php\?SubID=news"', re.DOTALL).findall(res)
        if res2:
            res2 = res2[0]
        else:
            continue

        topics = re.compile(r'messages/(.*?)\.html', re.DOTALL).findall(res2)
        topics = set(topics)

        print topics
        for topic in topics:
            q.put(topic)
Example #16
def fetchall(ran='1-max',debug=False):
	urlbase = 'http://www.verycd.com/archives/'
	if ran == '1-max':
		m1 = 1
		res = urllib.urlopen(urlbase).read()
		m2 = int(re.compile(r'archives/(\d+)').search(res).group(1))
	else:
		m = ran.split('-')
		m1 = int(m[0])
		m2 = int(m[1])
	print 'fetching list from',m1,'to',m2,'...'
	for i in range(m1,m2+1):
		url = urlbase + '%05d'%i + '.html'
		print 'fetching from',url,'...'
		res = httpfetch(url)
		ids = re.compile(r'topics/(\d+)/',re.DOTALL).findall(res)
		print ids
		for id in ids:
			q.put(id)
Example #17
def fetchall(ran='1-max', debug=False):
    urlbase = 'http://www.verycd.com/archives/'
    if ran == '1-max':
        m1 = 1
        res = urllib.urlopen(urlbase).read()
        m2 = int(re.compile(r'archives/(\d+)').search(res).group(1))
    else:
        m = ran.split('-')
        m1 = int(m[0])
        m2 = int(m[1])
    print 'fetching list from', m1, 'to', m2, '...'
    for i in range(m1, m2 + 1):
        url = urlbase + '%05d' % i + '.html'
        print 'fetching from', url, '...'
        res = httpfetch(url)
        ids = re.compile(r'topics/(\d+)/', re.DOTALL).findall(res)
        print ids
        for id in ids:
            q.put(id)
Example #18
def fetchall(ran="1-max", debug=False):
    urlbase = "http://www.verycd.com/archives/"
    if ran == "1-max":
        m1 = 1
        res = urllib.urlopen(urlbase).read()
        m2 = int(re.compile(r"archives/(\d+)").search(res).group(1))
    else:
        m = ran.split("-")
        m1 = int(m[0])
        m2 = int(m[1])
    print "fetching list from", m1, "to", m2, "..."
    for i in range(m1, m2 + 1):
        url = urlbase + "%05d" % i + ".html"
        print "fetching from", url, "..."
        res = httpfetch(url)
        ids = re.compile(r"topics/(\d+)/", re.DOTALL).findall(res)
        print ids
        for id in ids:
            q.put(id)
Example #19
def wenxue(num=2):
    urlbase = 'http://www.wenxuecity.com/news/'
    for i in range(1, num + 1):
        # print 'fetching wenxue city news on page', i, '...'
        url = urlbase + "morenews/?page=" + str(i)
        res = httpfetch(url, 'gb2312')
        res2 = re.compile(r'<div class="list" id="contentList">(.*?)<div class="turnpage">', re.DOTALL).findall(res)
        if res2:
            res2 = res2[0]
        else:
            continue
        # grab the IDs of the news items
        topics = re.compile(r'<a href="(.*?)" target="_blank">', re.DOTALL).findall(res2)
        topics = set(topics)
        # print topics
        for topic in topics:
            print topic
            q.put(topic)
            fetch(topic)
    return
Example #20
def search(keyword, full=True):
    """search verycd, fetch search results"""

    searchlog = path + "/search.log"
    open(searchlog, "a").write("\n" + keyword + "\n")

    url = "http://www.verycd.com/search/folders/" + keyword
    print "fetching search results ..."
    res = httpfetch(url)
    topics = re.compile(r"/topics/(\d+)", re.DOTALL).findall(res)
    topics = set(topics)
    links = []
    if full:
        links = re.compile(r"/search/folders/(.*?\?start=\d+)", re.DOTALL).findall(res)
        print links
    print topics
    if topics:
        for topic in topics:
            open(searchlog, "a").write(topic + ",")
            q.put(topic)
    if full and links:
        for key in links:
            search(key, full=False)
Example #21
def fetch(id, conn=conn, debug=False):
    print 'fetching topic', id, '...'
    urlbase = 'http://news.wenxuecity.com/messages/'
    url = urlbase + str(id) + '.html'
    news_id = id.split('-')[2]
    if dbfind(news_id, conn):
        return

    res = ''
    for _ in range(3):
        try:
            res = httpfetch(url, report=True)
            break
        except:
            continue

    title = re.compile(r'<h1 class="cnTitle">(.*?)</h1>', re.DOTALL).findall(res)
    if title:
        title = title[0]
        link = url
        web_site = '文学城'
    else:
        return
    try:
        source = re.compile(r'<span style="color: #006699;">(.*?)</span>', re.DOTALL).search(res).group(1)
        post_date = re.compile(r'#cc3300;">(.*?)</span>', re.DOTALL).search(res).group(1)
        content = re.compile(r'<td valign="top" class="main">(.*?)<div align="right">', re.DOTALL).findall(res)
    except:
        return

    if content:
        content = content[0]
        content = re.compile(r'<br />', re.DOTALL).sub('\n', content)
        content = re.compile(r'<.*?>', re.DOTALL).sub('', content)
        content = re.compile(r'&.*?;', re.DOTALL).sub(' ', content)
        content = re.compile(r'\n\s+', re.DOTALL).sub('\n', content)
        content = content.strip()
    else:
        content = ''

    if debug:
        print title
        print source
        print content
        print post_date
        print web_site

    tries = 0
    while tries < 3:
        try:
            if not dbfind(news_id, conn):
                dbinsert(news_id, title, source, content, post_date, link, web_site, conn)
            else:
                break  # already stored; 'continue' would loop forever without incrementing tries
            # dbupdate(news_id,title,source,content,post_date,link,web_site,conn)
            break
        except:
            print sys.exc_info()[0]
            tries += 1
            time.sleep(5)
            continue

    return post_date
Example #22
def fetch(id, conn=conn, debug=False):
    print 'fetching topic', id, '...'
    urlbase = 'http://www.verycd.com/topics/'
    url = urlbase + str(id)

    res = ''
    for _ in range(3):
        try:
            res = httpfetch(url, report=True)
            break
        except:
            continue

    abstract = re.compile(r'<h1>.*?visit', re.DOTALL).findall(res)
    if not abstract:
        print res
        if res == '' or '很抱歉' in res:
            print 'resource does not exist'
            return
        else:
            print 'fetching', id, 'again...'
            return fetch(id, conn)
    abstract = abstract[0]
    title = re.compile(r'<h1>(.*?)</h1>', re.DOTALL).findall(abstract)
    if title:
        title = title[0]
    else:
        return
    try:
        status = re.compile(r'"requestWords">(.*?)<',
                            re.DOTALL).search(abstract).group(1)
        brief = re.compile(r'"font-weight:normal"><span>(.*?)</td>',
                           re.DOTALL).search(abstract).group(1)
        brief = re.compile(r'<.*?>', re.DOTALL).sub('', brief).strip()
        pubtime = re.compile(
            r'"date-time">(.*?)</span>.*?"date-time">(.*?)</span>',
            re.DOTALL).findall(abstract)[0]
        category1 = re.compile(r'分类.*?<td>(.*?)&nbsp;&nbsp;(.*?)&nbsp;&nbsp;',
                               re.DOTALL).findall(abstract)[0]
        category = ['', '']
        category[0] = re.compile(r'<.*?>',
                                 re.DOTALL).sub('', category1[0]).strip()
        category[1] = re.compile(r'<.*?>',
                                 re.DOTALL).sub('', category1[1]).strip()

        res2 = re.compile(r'iptcomED2K"><!--eMule.*?<!--eMule end-->',
                          re.DOTALL).findall(res)[0]

        ed2k = re.compile(
            r'ed2k="([^"]*)" subtitle_[^=]*="([^"]*)">([^<]*)</a>',
            re.DOTALL).findall(res2)
        ed2k.extend(
            re.compile(r'ed2k="([^"]*)">([^<]*)</a>', re.DOTALL).findall(res2))

        content = re.compile(r'<!--eMule end-->(.*?)<!--Wrap-tail end-->',
                             re.DOTALL).findall(res)
    except:
        return

    if content:
        content = content[0]
        content = re.compile(r'<br />', re.DOTALL).sub('\n', content)
        content = re.compile(r'<.*?>', re.DOTALL).sub('', content)
        content = re.compile(r'&.*?;', re.DOTALL).sub(' ', content)
        content = re.compile(r'\n\s+', re.DOTALL).sub('\n', content)
        content = content.strip()
    else:
        content = ''

    if debug:
        print title
        print status
        print brief
        print pubtime[0], pubtime[1]
        print category[0], category[1]
        for x in ed2k:
            print x
        print content

    ed2kstr = ''
    for x in ed2k:
        ed2kstr += '`'.join(x) + '`'

    if not dbfind(id, conn):
        dbinsert(id, title, status, brief, pubtime, category, ed2kstr, content,
                 conn)
    else:
        dbupdate(id, title, status, brief, pubtime, category, ed2kstr, content,
                 conn)

    return pubtime[1]
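
The dbfind/dbinsert/dbupdate helpers used by fetch() are not part of this listing. A minimal sketch of what the call sites suggest; the 'verycd' table name and column layout are guesses reconstructed from the arguments (pubtime is a 2-tuple and category a 2-element list, flattened here):

def dbfind(id, conn):
    # True if the topic is already stored
    c = conn.cursor()
    c.execute('select 1 from verycd where id=?', (int(id),))
    row = c.fetchone()
    c.close()
    return row is not None

def dbinsert(id, title, status, brief, pubtime, category, ed2kstr, content, conn):
    c = conn.cursor()
    c.execute('insert into verycd values (?,?,?,?,?,?,?,?,?)',
              (int(id), title, status, brief, pubtime[0], pubtime[1],
               '`'.join(category), ed2kstr, content))
    conn.commit()
    c.close()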
Example #23
def fetch(id,db=db,dbl=dbl,dbc=dbc,debug=False):
    print 'fetching topic',id,'...'
    urlbase = 'http://www.verycd.com/topics/'
    url = urlbase + str(id)

    res = ''
    for _ in range(3):
        try:
            res = httpfetch(url,report=True,needlogin=False)
            break
        except:
            continue

    abstract = re.compile(r'<h1>.*?visit',re.DOTALL).findall(res)
    if not abstract:
        print res
        if res == '' or '很抱歉' in res:
            print 'resource does not exist'
            return
        else:
            print 'fetching',id,'again...'
            return fetch(id,db)
    abstract = abstract[0]
    
    title = re.compile(r'<h1>(.*?)</h1>',re.DOTALL).findall(abstract)
    if title:
        title=title[0]
    else:
        return
    try:
        status = re.compile(r'"requestWords">(.*?)<',re.DOTALL).search(abstract).group(1)
        brief = re.compile(r'"font-weight:normal">\s*<span>(.*?)</td>',re.DOTALL).search(abstract).group(1)
        brief = re.compile(r'<.*?>',re.DOTALL).sub('',brief).strip()
        pubtime = re.compile(r'"date-time">(.*?)</span>.*?"date-time">(.*?)</span>',re.DOTALL).findall(abstract)[0]
        category1 = re.compile(r'<strong>分类.*?<td>(.*?)&nbsp;&nbsp;(.*?)&nbsp;&nbsp;',re.DOTALL).findall(abstract)[0]
        category = ['','']
        category[0] = re.compile(r'<.*?>',re.DOTALL).sub('',category1[0]).strip()
        category[1] = re.compile(r'<.*?>',re.DOTALL).sub('',category1[1]).strip()
    
        ed2k = re.compile(r'ed2k="([^"]*)" (subtitle_[^=]*="[^"]*"[^>]*)>([^<]*)</a>',re.DOTALL).findall(res)
        ed2k.extend( re.compile(r'ed2k="([^"]*)">([^<]*)</a>',re.DOTALL).findall(res) )
        # delete duplicates while preserving order; the old alias-and-remove
        # loop deleted the first matching element, not the one at index i
        seen = set()
        deduped = []
        for x in ed2k:
            if x not in seen:
                seen.add(x)
                deduped.append(x)
        ed2k = deduped
        content = re.compile(r'id="iptcomContents">(.*?)<!--Wrap-tail end-->',re.DOTALL).findall(res)
    except:
        return

    if content:
        content = content[0]
        content = re.compile(r'<(img .*?)>').sub(r'[\1]',content)
        content = re.compile(r'<br />',re.DOTALL).sub('\n',content)
        content = re.compile(r'<.*?>',re.DOTALL).sub('',content)
        content = re.compile(r'&.*?;',re.DOTALL).sub(' ',content)
        content = re.compile(r'\n\s+',re.DOTALL).sub('\n',content)
        content = re.compile(r'\[(img .*?)\]').sub(r'<\1><br>',content)
        content = re.compile(r'(image-\d*)\.verycd\.com',re.I).sub(r'\1.app-base.com',content)
        content = content.strip()
    else:
        content=''
    
    vcpv = 0

    #fetch stat
    try:
        staturl = 'http://stat.verycd.com/counters/folder/'+str(id)+'/'
        st = httpfetch(staturl)
        vcpv = int(re.compile(r'\'(\d+)\'').findall(st)[0])
    except:
        pass

    # update lock
    owner = re.compile(r'<div id="userres">.*?<td align="left" valign="top"><p><strong id="username"><a href=.*?>(.*)</a></strong>',re.DOTALL).findall(res)
    if owner:
        owner = owner[0]
        cl=dbl.cursor()
        try:
            cl.execute('replace into lock values (?,?,?,?,?,?,?)',(long(id),True,owner,'',title,pubtime[1],vcpv))
        except:
            pass
        while True:
            try:
                dbl.commit()
                break
            except:
                time.sleep(1)  # back off briefly instead of busy-spinning on a locked db
        cl.close()

    if debug:
        if vcpv:
            print vcpv
        if owner:
            print owner
        print title
        print status
        print brief
        print pubtime[0],pubtime[1]
        print category[0],category[1]
        for x in ed2k:
            print x
        print content

    ed2kstr = ''
    for x in ed2k:
        ed2kstr += '`'.join(x)+'`'

    if ed2kstr == '':
        return

    # update verycd
    try:
        if not dbfind(id,db):
            dbinsert(id,title,status,brief,pubtime,category,ed2kstr,content,db)
        else:
            dbupdate(id,title,status,brief,pubtime,category,ed2kstr,content,db)
    except Exception as what:
        print what

    # update comment
    fetchcmt(id=id,dbc=dbc)

    return pubtime[1]
Example #24
def fetch(id, conn=conn, debug=False):
    print "fetching topic", id, "..."
    urlbase = "http://www.verycd.com/topics/"
    url = urlbase + str(id)

    res = ""
    for _ in range(3):
        try:
            res = httpfetch(url, report=True)
            break
        except:
            continue

    abstract = re.compile(r"<h1>.*?visit", re.DOTALL).findall(res)
    if not abstract:
        print res
        if res == "" or "很抱歉" in res:
            print "resource does not exist"
            return
        else:
            print "fetching", id, "again..."
            return fetch(id, conn)
    abstract = abstract[0]

    title = re.compile(r"<h1>(.*?)</h1>", re.DOTALL).findall(abstract)
    if title:
        title = title[0]
    else:
        return
    try:
        status = re.compile(r'"requestWords">(.*?)<', re.DOTALL).search(abstract).group(1)
        brief = re.compile(r'"font-weight:normal"><span>(.*?)</td>', re.DOTALL).search(abstract).group(1)
        brief = re.compile(r"<.*?>", re.DOTALL).sub("", brief).strip()
        pubtime = re.compile(r'"date-time">(.*?)</span>.*?"date-time">(.*?)</span>', re.DOTALL).findall(abstract)[0]
        category1 = re.compile(r"分类.*?<td>(.*?)&nbsp;&nbsp;(.*?)&nbsp;&nbsp;", re.DOTALL).findall(abstract)[0]
        category = ["", ""]
        category[0] = re.compile(r"<.*?>", re.DOTALL).sub("", category1[0]).strip()
        category[1] = re.compile(r"<.*?>", re.DOTALL).sub("", category1[1]).strip()

        # 		res2 = re.compile(r'iptcomED2K"><!--eMule.*?<!--eMule end-->',re.DOTALL).findall(res)[0]

        ed2k = re.compile(r'ed2k="([^"]*)" (subtitle_[^=]*="[^"]*"[^>]*)>([^<]*)</a>', re.DOTALL).findall(res)
        ed2k.extend(re.compile(r'ed2k="([^"]*)">([^<]*)</a>', re.DOTALL).findall(res))

        content = re.compile(r"<!--eMule end-->(.*?)<!--Wrap-tail end-->", re.DOTALL).findall(res)
    except:
        return

    if content:
        content = content[0]
        content = re.compile(r"<br />", re.DOTALL).sub("\n", content)
        content = re.compile(r"<.*?>", re.DOTALL).sub("", content)
        content = re.compile(r"&.*?;", re.DOTALL).sub(" ", content)
        content = re.compile(r"\n\s+", re.DOTALL).sub("\n", content)
        content = content.strip()
    else:
        content = ""

    if debug:
        print title
        print status
        print brief
        print pubtime[0], pubtime[1]
        print category[0], category[1]
        for x in ed2k:
            print x
        print content

    ed2kstr = ""
    for x in ed2k:
        ed2kstr += "`".join(x) + "`"
    tries = 0
    while tries < 3:
        try:
            if not dbfind(id, conn):
                dbinsert(id, title, status, brief, pubtime, category, ed2kstr, content, conn)
            else:
                dbupdate(id, title, status, brief, pubtime, category, ed2kstr, content, conn)
            break
        except:
            tries += 1
            time.sleep(5)
            continue

    return pubtime[1]
Example #25
def fetch(id,conn=conn,debug=False):
	print 'fetching topic',id,'...'
	urlbase = 'http://www.verycd.com/topics/'
	url = urlbase + str(id)

	res = ''
	for _ in range(3):
		try:
			res = httpfetch(url,report=True)
			break
		except:
			continue

	abstract = re.compile(r'<h1>.*?visit',re.DOTALL).findall(res)
	if not abstract:
		print res
		if res == '' or '很抱歉' in res:
			print 'resource does not exist'
			return
		else:
			print 'fetching',id,'again...'
			return fetch(id,conn)
	abstract = abstract[0]
    

	title = re.compile(r'<h1>(.*?)</h1>',re.DOTALL).findall(abstract)[0]
	status = re.compile(r'"requestWords">(.*?)<',re.DOTALL).search(abstract).group(1)
	brief = re.compile(r'"font-weight:normal"><span>(.*?)</td>',re.DOTALL).search(abstract).group(1)
	brief = re.compile(r'<.*?>',re.DOTALL).sub('',brief).strip()
	pubtime = re.compile(r'"date-time">(.*?)</span>.*?"date-time">(.*?)</span>',re.DOTALL).findall(abstract)[0]
	category1 = re.compile(r'分类.*?<td>(.*?)&nbsp;&nbsp;(.*?)&nbsp;&nbsp;',re.DOTALL).findall(abstract)[0]
	category = ['','']
	category[0] = re.compile(r'<.*?>',re.DOTALL).sub('',category1[0]).strip()
	category[1] = re.compile(r'<.*?>',re.DOTALL).sub('',category1[1]).strip()

	res2 = re.compile(r'iptcomED2K"><!--eMule.*?<!--eMule end-->',re.DOTALL).findall(res)[0]

	ed2k = re.compile(r'ed2k="([^"]*)" subtitle_[^=]*="([^"]*)">([^<]*)</a>',re.DOTALL).findall(res2)
	ed2k.extend( re.compile(r'ed2k="([^"]*)">([^<]*)</a>',re.DOTALL).findall(res2) )

	content = re.compile(r'<!--eMule end-->(.*?)<!--Wrap-tail end-->',re.DOTALL).findall(res)

	if content:
		content = content[0]
		content = re.compile(r'<br />',re.DOTALL).sub('\n',content)
		content = re.compile(r'<.*?>',re.DOTALL).sub('',content)
		content = re.compile(r'&.*?;',re.DOTALL).sub(' ',content)
		content = re.compile(r'\n\s+',re.DOTALL).sub('\n',content)
		content = content.strip()
	else:
		content=''

	if debug:
		print title
		print status
		print brief
		print pubtime[0],pubtime[1]
		print category[0],category[1]
		for x in ed2k:
			print x
		print content

	ed2kstr = ''
	for x in ed2k:
		ed2kstr += '`'.join(x)+'`'

	if not dbfind(id,conn):
		dbinsert(id,title,status,brief,pubtime,category,ed2kstr,content,conn)
	else:
		dbupdate(id,title,status,brief,pubtime,category,ed2kstr,content,conn)

	return pubtime[1]
Example #26
def fetch(i, debug=False):
    path = os.path.dirname(os.path.realpath(sys.argv[0]))
    conn = sqlite3.connect(path + '/news.sqlite3.db')
    conn.text_factory = str
    # print 'fetching topic', i, '...'
    urlbase = 'http://www.wenxuecity.com'
    url = urlbase + i
    news_id = i.split('/')[5]
    news_id = news_id.split('.')[0]
    w = "文学城"
    # if database.find(news_id, w, conn):
    #     return
    post_date = ''  # default so the final 'return post_date' cannot raise NameError
    res = ''
    for _ in range(3):
        try:
            res = httpfetch(url, 'utf-8', report=True)
            break
        except:
            print sys.exc_info()[1]
            continue
    res = re.compile(r'<div class="maincontainer">(.*?)<div class="banners">',
                     re.DOTALL).findall(res)[0]
    title = re.compile(r'<h3>(.*?)</h3>', re.DOTALL).findall(res)
    if title:
        title = title[0].encode('utf-8')
        link = url
        web_site = '文学城'
        try:
            parse = re.compile(r'<div id="postmeta">(.*?) <span>',
                               re.DOTALL).search(res).group(1)
            source = re.compile(r'itemprop="author">(.*?)</span>',
                                re.DOTALL).findall(parse)[0]
            post_date = re.compile(r'datetime(.*?)</time>',
                                   re.DOTALL).findall(parse)[0]
            post_date = post_date.split('>')[1]
            content = re.compile(
                r'<div id="articleContent" class="article">(.*?)<div class="sharewechat">',
                re.DOTALL).findall(res)[0]
            if content:
                # content = content[0]
                content = re.compile(r'<div style=(.*?)>',
                                     re.DOTALL).sub('', content)
                content = re.compile(r'<br>', re.DOTALL).sub('', content)
                content = re.compile(r'<.*?>', re.DOTALL).sub('', content)
                content = re.compile(r'&.*?;', re.DOTALL).sub(' ', content)
                content = re.compile(r'\n\s+', re.DOTALL).sub('\n', content)
                content = content.strip()
            else:
                content = ''
                print news_id

            if debug:
                print title
                print source
                print content
                print post_date
                print web_site

            if not database.find(news_id, web_site, conn):
                database.insert(news_id, title, source, content, post_date,
                                link, web_site, conn)
            else:
                database.update(news_id, title, source, content, post_date,
                                link, web_site, conn)
        except:
            print "failed to parse", url
            print sys.exc_info()[1]
    return post_date
Example #27
	def run(self):
		for i in range(8):
			t = Thread(target=self.thread_fetch)
			t.setDaemon(True)
			t.start()

		conn = sqlite3.connect(self.path+'/verycd.sqlite3.db')
		conn.text_factory = str
		while True:
			try:
				#feed
				if time.mktime(time.gmtime())%60<10:
					self.q.put('feed')
				#check searchqueue every 10 secs
				taskqueue = open(self.path+'/searchqueue','r').readlines()
				print taskqueue,time.mktime(time.gmtime()),time.mktime(time.gmtime())%900
				open(self.path+'/searchqueue','w').write('')
				for task in taskqueue:
					url = 'http://www.verycd.com/search/folders/'+task
					print 'fetching', url, '...'
					res = httpfetch(url)
					print '...fetching completed'
					topics = re.compile(r'/topics/(\d+)',re.DOTALL).findall(res)
					topics = set(topics)
					for topic in topics:
						self.q.put(topic)
				if taskqueue == []:
					time.sleep(10)
				# read feed every 600 secs
				if time.mktime(time.gmtime())%600<10:
					url = 'http://www.verycd.com/sto/feed'
					print 'fetching feed ...'
					feeds = httpfetch(url)
					topics = re.compile(r'/topics/(\d+)',re.DOTALL).findall(feeds)
					topics = set(topics)
					print topics
					now = time.mktime(time.gmtime())
					for topic in topics:
						self.q.put(topic)
				# read hot everyday at gmt 19:00
				# read hot every 4 hours
				# seconds since GMT midnight; with %(86400/6) the >69000 check below could never fire
				timeofday = time.mktime(time.gmtime())%86400
#				if timeofday>68400 and timeofday < 68410:
				if time.mktime(time.gmtime())%(3600*4)<10:
					url = 'http://www.verycd.com/'
					print 'fetching homepage ...'
					home = httpfetch(url)
					hotzone = re.compile(r'热门资源.*?</dl>',re.DOTALL).search(home).group()
					hot = re.compile(r'<a href="/topics/(\d+)/"[^>]*>(《.*?》)[^<]*</a>',re.DOTALL).findall(hotzone)
					html = '<h2 style="color:red">每日热门资源</h2>\n'
					for topic in hot:
						print 'fetching hot topic',topic[0],'...'
						self.q.put(topic[0])
						html += '&nbsp;<a target="_parent" href="/?id=%s">%s</a>&nbsp;\n' % topic
					open(self.path+'/static/hot.html','w').write(html)
				# update 20 whole pages at gmt 19:10
				if timeofday>69000 and timeofday < 69010:
					urlbase = 'http://www.verycd.com/sto/~all/page'
					for i in range(1,20):
						print 'fetching list',i,'...'		
						url = urlbase+str(i)
						res = httpfetch(url)
						res2 = re.compile(r'"topic-list"(.*?)"pnav"',re.DOTALL).findall(res)
						if res2:
							res2 = res2[0]
						else:
							continue
						topics = re.compile(r'/topics/(\d+)',re.DOTALL).findall(res2)
						topics = set(topics)
						print topics	
						for topic in topics:
							self.q.put(topic)
				# update 1 pages@normal and 1 pages@request every 3600 secs
				if time.mktime(time.gmtime())%3600<10:
					url = 'http://www.verycd.com/orz/page1?stat=normal'
					idx = httpfetch(url,needlogin=True)
					ids = re.compile(r'/topics/(\d+)',re.DOTALL).findall(idx)
					print ids[0]
					for id in ids:
						self.q.put(id)
					url = 'http://www.verycd.com/orz/page1?stat=request'
					idx = httpfetch(url,needlogin=True)
					ids = re.compile(r'/topics/(\d+)',re.DOTALL).findall(idx)
					print ids[0]
					for id in ids:
						self.q.put(id)
			except:
				time.sleep(10)
				continue
Example #28
def fetch(id,conn=conn,debug=False):
	print 'fetching topic',id,'...'
	urlbase = 'http://www.verycd.com/topics/'
	url = urlbase + str(id)

	res = ''
	for _ in range(3):
		try:
			res = httpfetch(url,report=True)
			break
		except:
			continue

	abstract = re.compile(r'<h1.*?</ul>',re.DOTALL).findall(res)
	if not abstract:
		print res
		if res == '' or '很抱歉' in res:
			print 'resource does not exist'
			return
		else:
			print 'fetching',id,'again...'
			return fetch(id,conn)
	abstract = abstract[0]
    
	title = re.compile(r'<h1.*?</h1>',re.DOTALL).findall(abstract)
	if title:
		title = title[0]
		title = re.compile(r'<.*?>',re.DOTALL).sub('',title).strip()
	else:
		return
	try:
		status = re.compile(r'状态.*?<span>(.*?)</span>.*?</li>',re.DOTALL).search(abstract).group(1)
		brief = re.compile(r'摘要.*?<span>(.*?)</li>',re.DOTALL).search(abstract).group(1)
		brief = re.compile(r'<.*?>',re.DOTALL).sub('',brief).strip()
		pubtime = re.compile(r'date-time.*?>(.*?)</span>.*?date-time.*?>(.*?)</span>',re.DOTALL).findall(abstract)[0]
		category1 = re.compile(r'分类.*?<a.*?>(.*?)</a>.*?<a.*?>(.*?)</a>',re.DOTALL).findall(abstract)[0]
		category = ['','']
		category[0] = re.compile(r'<.*?>',re.DOTALL).sub('',category1[0]).strip()
		category[1] = re.compile(r'<.*?>',re.DOTALL).sub('',category1[1]).strip()
		print category
	
#		res2 = re.compile(r'iptcomED2K"><!--eMule.*?<!--eMule end-->',re.DOTALL).findall(res)[0]
	
		ed2k = re.compile(r'ed2k="([^"]*)" (subtitle_[^=]*="[^"]*"[^>]*)>([^<]*)</a>',re.DOTALL).findall(res)
		ed2k.extend( re.compile(r'ed2k="([^"]*)">([^<]*)</a>',re.DOTALL).findall(res) )
	
		content = re.compile(r'<!--eMule end-->(.*?)<!--Wrap-tail end-->',re.DOTALL).findall(res)
	except:
		return

	if content:
		content = content[0]
		content = re.compile(r'<br />',re.DOTALL).sub('\n',content)
		content = re.compile(r'<.*?>',re.DOTALL).sub('',content)
		content = re.compile(r'&.*?;',re.DOTALL).sub(' ',content)
		content = re.compile(r'\n\s+',re.DOTALL).sub('\n',content)
		content = content.strip()
	else:
		content=''

	if debug:
		print title
		print status
		print brief
		print pubtime[0],pubtime[1]
		print category[0],category[1]
		for x in ed2k:
			print x
		print content

	ed2kstr = ''
	for x in ed2k:
		ed2kstr += '`'.join(x)+'`'
	tries = 0
	while tries<3:
		try:
			if not dbfind(id,conn):
				dbinsert(id,title,status,brief,pubtime,category,ed2kstr,content,conn)
			else:
				dbupdate(id,title,status,brief,pubtime,category,ed2kstr,content,conn)
			break
		except:
			tries += 1
			time.sleep(5)
			continue

	return pubtime[1]
Example #29
def fetch(i, debug=False):
    # path = os.path.dirname(os.path.realpath(sys.argv[0]))
    # conn = sqlite3.connect(path + '/news.sqlite3.db')
    # conn.text_factory = str
    # print 'fetching topic', i, '...'
    urlbase = 'http://www.wenxuecity.com'
    url = urlbase + i
    news_id = i.split('/')[5]
    news_id = news_id.split('.')[0]
    w = "文学城"
    # if database.find(news_id, w, conn):
    #     return
    res = ''
    for _ in range(3):
        try:
            res = httpfetch(url, 'utf-8', report=True)
            break
        except:
            print sys.exc_info()[1]
            continue
    try:
        res = re.compile(r'<div class="maincontainer">(.*?)<div class="sharewechat">', re.DOTALL).findall(res)[0]
        title = re.compile(r'<h3>(.*?)</h3>', re.DOTALL).findall(res)
    except Exception as e:
        print e
        print url
        return  # title would be undefined past this point
    if title:
        title = html_decode(title[0].encode('utf-8'))
        link = url
        web_site = '文学城'
        try:
            parse = re.compile(r'<div id="postmeta">(.*?) <span>', re.DOTALL).search(res).group(1)
            source = re.compile(r'itemprop="author">(.*?)</span>', re.DOTALL).findall(parse)[0]
            post_date = re.compile(r'datetime(.*?)</time>', re.DOTALL).findall(parse)[0]
            post_date = post_date.split('>')[1]
            content = re.compile(r'<div id="articleContent" class="article">(.*?)<iframe', re.DOTALL).findall(res)[0]
            if content:
                # content = content[0]
                content = re.compile(r'<div style=(.*?)>', re.DOTALL).sub('', content)
                content = re.compile(r'<br>', re.DOTALL).sub('', content)
                content = re.compile(r'<.*?>', re.DOTALL).sub('', content)
                content = re.compile(r'&.*?;', re.DOTALL).sub(' ', content)
                content = re.compile(r'\n\s+', re.DOTALL).sub('\n', content)
                content = content.strip()
            else:
                content = ''
                print news_id

            if debug:
                print title
                print source
                print content
                print post_date
                print web_site
            n = {
                "news_id": news_id,
                "title": title,
                "content": html_decode(content),
                "source": source,
                "link": link,
                "post_date": post_date
            }
            uri = 'http://' + HOST_NAME + '/api/wenxue'
            headers = {"Content-Type": "application/json"}
            r = requests.post(uri, json=n, headers=headers)
            print r.text
            # if not database.find(news_id, web_site, conn):
            #     database.insert(news_id, title, source, content, post_date, link, web_site, conn)
            # else:
            #     database.update(news_id, title, source, content, post_date, link, web_site, conn)
        except Exception as e:
            print "Failed with:", title
            print e