Example #1
    def poll(self, timeout=0.1):
        # sanity checks: drop channels that got too big or too slow
        now = time.time()
        for channel in asyncore.socket_map.values():
            if channel.bytes_in > self.max_size:
                channel.close()  # too much data
                try:
                    channel.consumer.http(
                        0, channel, ("HTTPManager", "too much data", None))
                except Exception:
                    pass  # a broken consumer must not stop the manager
            if channel.timestamp and now - channel.timestamp > self.max_time:
                channel.close()  # too slow
                try:
                    channel.consumer.http(0, channel,
                                          ("HTTPManager", "timeout", None))
                except Exception:
                    pass
        # activate up to max_connections channels
        while self.queue and len(asyncore.socket_map) < self.max_connections:
            http_client.do_request(*self.queue.pop(0))
        # keep the network running
        asyncore.poll(timeout)
        # return non-zero if we should keep on polling
        return len(self.queue) or len(asyncore.socket_map)
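
The poll() method above assumes a manager object that carries a queue of pending requests and a few limits. A minimal sketch of such a class, with poll() as one of its methods; the defaults and the request() helper are assumptions, not part of the original:

import time
import asyncore

import http_client  # the asynchronous HTTP client module used by these snippets


class HTTPManager:
    def __init__(self, max_connections=8, max_size=1000000, max_time=60):
        self.max_connections = max_connections  # parallel channel limit
        self.max_size = max_size                # byte limit per channel
        self.max_time = max_time                # seconds before a channel times out
        self.queue = []                         # pending (uri, consumer) pairs

    def request(self, uri, consumer):
        # queue a request; poll() activates it when a connection slot is free
        self.queue.append((uri, consumer))

    # ... poll() from above goes here ...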
    def http_header(self, client):
        # remember where this response came from and reset the data buffer
        self.host = client.host
        self.header = client.header
        self.path = client.path
        self.data = ""
        header = dict(client.header.items())
        print "RECURSE GET", self.host + self.path, client.status[0], client.status[1]
        # follow redirects by re-requesting the Location header
        if client.status[1] in ("301", "302"):
            http_client.do_request(header['location'], lj_parser_recurse())
    def http_header(self, client):
        self.host = client.host
        self.header = client.header
        self.path = client.path
        self.data = ""
        header = dict(client.header.items())
        # follow a permanent redirect, otherwise just log the status line
        if client.status[1] == "301":
            http_client.do_request(header['location'], parser())
        else:
            print self.host + self.path + " " + client.status[0] + " " + client.status[1]
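
The close() method below parses self.data, but none of the fragments shows how that buffer is filled; presumably the consumer also has a feed() method that the HTTP client calls with chunks of the response body. A minimal sketch of that missing piece (an assumption, not shown in the original):

    def feed(self, data):
        # accumulate response chunks so close() can parse the complete page
        self.data = self.data + data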
    def close(self):
        # the rating page is cp1251-encoded; recode it to utf-8 before parsing
        self.data = unicode(self.data, "cp1251").encode("utf-8")
        # extract the journal hostnames and queue a FOAF request for each
        logins = re.findall(r"""<a href="http://([^/]+)/" title="[^"]+">[^<]+</a>""", self.data)
        for login in logins:
            http_client.do_request("http://" + login + "/data/foaf", lj_parser())
        # follow the "next page" link of the rating, if there is one
        next_page = re.findall(r"""id="next_page" href="/top/lj/\?page=(\d+)""", self.data)
        if next_page:
            http_client.do_request(URLS[0] + "?page=" + next_page[0], parser())
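
getit() and write() below rely on module-level state that the fragments never define. A plausible sketch, with the names taken from the code and the initial values assumed:

import csv
import re
import asyncore

import http_client  # the asynchronous HTTP client used throughout

ljquery = []                # FOAF URLs still waiting to be requested
ljpoolsize = []             # URLs already handed to http_client
ljpool = []                 # parsed rows waiting to be written out
filename = "ljlogin.txt"    # same output file as in the later fragment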
def getit():
    # keep up to 200 requests in flight, pulling URLs off the query list
    while len(ljpoolsize) < 200 and len(ljquery) > 0:
        url = ljquery.pop()
        http_client.do_request(url, lj_parser_recurse())
        ljpoolsize.append(url)

writer = csv.writer(open(filename, "ab"), dialect=csv.excel)

def write():
    # flush the collected rows to disk and empty the pool
    writer.writerows(ljpool)
    del ljpool[0:len(ljpool)]

http_client.do_request("http://shared_lj.livejournal.com/data/foaf", lj_parser_recurse())
asyncore.loop()
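
Instead of a bare asyncore.loop(), the crawl can also be driven through the manager's poll(), so getit() and write() get a chance to run between network iterations. A sketch, assuming the HTTPManager class from above:

manager = HTTPManager()
while 1:
    getit()                    # top up the request pool from ljquery
    running = manager.poll(1)  # push the network forward for up to a second
    write()                    # flush any finished rows to the CSV file
    if not running and not ljquery:
        break                  # nothing in flight and nothing left to request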


import atomstream
import feedparser
from itertools import imap
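
The loop below also uses re, http_client and a module-level ljuser dict to deduplicate logins, none of which the fragment sets up; presumably something like:

import re

import http_client  # the asynchronous HTTP client used throughout

ljuser = {}   # logins already requested, so every journal is fetched only once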



while 1:
    # read the LiveJournal Atom update stream and request FOAF data
    # for every user we have not seen yet
    for update in imap(feedparser.parse, atomstream.connect()):
        login = re.findall(r"http://(.+)\.livejournal\.com", update.feed.link)
        if login and login[0] != "community" and login[0] not in ljuser:
            ljuser[login[0]] = 1
            http_client.do_request("http://" + login[0] + ".livejournal.com/data/foaf", lj_parser_recurse())

        # (fragment from the FOAF consumer's close() method; nick, ljnick,
        # name and img are extracted with similar re.findall calls not shown)
        date = re.findall(r"""<foaf:dateOfBirth>([^<]+)</foaf:dateOfBirth>""", self.data)
        posted = re.findall(r"""<ya:posted>([^<]+)</ya:posted>""", self.data)

        if nick:
            # here ljuser is used as a list of result rows
            ljuser.append((nick and nick[0] or "",
                           ljnick and ljnick[0] or "",
                           name and name[0] or "",
                           img and img[0] or "",
                           date and date[0] or "",
                           posted and posted[0] or ""))

        # flush to the CSV file every 50 rows
        if len(ljuser) >= 50:
            writer.writerows(ljuser)
            del ljuser[0:len(ljuser)]

        return



URLS = ("http://blog.yandex.ru/top/lj/",)

filename = "ljlogin.txt"
writer = csv.writer(open(filename, "wb"), dialect=csv.excel)


for url in URLS:
    http_client.do_request(url, parser())

asyncore.loop()

writer.writerows(ljuser)