def poll(self, timeout=0.1):
    # sanity checks
    now = time.time()
    for channel in asyncore.socket_map.values():
        if channel.bytes_in > self.max_size:
            channel.close()  # too much data
            try:
                channel.consumer.http(
                    0, channel, ("HTTPManager", "too much data", None))
            except:
                pass
        if channel.timestamp and now - channel.timestamp > self.max_time:
            channel.close()  # too slow
            try:
                channel.consumer.http(0, channel, ("HTTPManager", "timeout", None))
            except:
                pass
    # activate up to max_connections channels
    while self.queue and len(asyncore.socket_map) < self.max_connections:
        http_client.do_request(*self.queue.pop(0))
    # keep the network running
    asyncore.poll(timeout)
    # return non-zero if we should keep on polling
    return len(self.queue) or len(asyncore.socket_map)
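For context, poll() is meant to be driven from a small outer loop instead of asyncore.loop(). A minimal sketch of such a driver, assuming the method above lives on an HTTPManager class whose request(uri, consumer) method simply appends (uri, consumer) pairs to self.queue (the class and request() names are assumptions; only poll() appears here):

# hypothetical driver for the poll() method above
manager = HTTPManager()

manager.request("http://example.com/a", some_consumer())
manager.request("http://example.com/b", some_consumer())

# poll() returns non-zero while there are still queued or active channels
while manager.poll(timeout=1.0):
    pass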
def http_header(self, client):
    self.host = client.host
    self.header = client.header
    self.path = client.path
    self.data = ""
    header = dict(client.header.items())
    print "RECURSE GET", self.host + self.path, client.status[0], client.status[1]
    # follow 301/302 redirects with a fresh recursive parser
    if client.status[1] == "301" or client.status[1] == "302":
        http_client.do_request(header['location'], lj_parser_recurse())
def http_header(self, client):
    self.host = client.host
    self.header = client.header
    self.path = client.path
    self.data = ""
    header = dict(client.header.items())
    if client.status[1] == "301":
        http_client.do_request(header['location'], parser())
    else:
        print self.host + self.path + " " + client.status[0] + " " + client.status[1]
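For reference, the consumer objects handed to http_client.do_request implement a small callback interface: http_header(client) and close() are visible in these snippets, and a feed(data) method that accumulates the body into self.data is implied. A minimal consumer sketch under that assumption (the feed() signature is an assumption, not shown in the original):

class debug_consumer:
    # assumed interface: http_client calls http_header() once the response
    # headers are in, feed() for each chunk of body data, and close() at the end
    def http_header(self, client):
        self.host = client.host
        self.path = client.path
        self.status = client.status
        self.data = ""

    def feed(self, data):
        self.data = self.data + data

    def close(self):
        print self.host + self.path, self.status[1], len(self.data), "bytes"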
def close(self):
    # decode the rating page from cp1251 and pull out the journal links
    self.data = unicode(self.data, "cp1251").encode("utf-8")
    logins = re.findall("""<a href="http://([^/]+)/" title="[^"]+">[^<]+</a>""", self.data)
    for login in logins:
        http_client.do_request("http://" + login + "/data/foaf", lj_parser())
    # follow the rating's "next page" link, if there is one
    next = re.findall("""id="next_page" href="/top/lj/\?page=(\d+)""", self.data)
    if next:
        http_client.do_request(URLS[0] + "?page=" + next[0], parser())
    return
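To show what the login regex actually captures, here is a small standalone check against a made-up line of the top-list markup (the sample HTML is an illustrative assumption, not taken from blog.yandex.ru):

import re

sample = '<a href="http://someuser.livejournal.com/" title="Some User">Some User</a>'
print re.findall("""<a href="http://([^/]+)/" title="[^"]+">[^<]+</a>""", sample)
# -> ['someuser.livejournal.com']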
def getit():
    while len(ljpoolsize) < 200 and len(ljquery) > 0:
        url = ljquery.pop()
        # print "do request", url
        http_client.do_request(url, lj_parser_recurse())
        ljpoolsize.append(url)
writer = csv.writer(open(filename, "ab"), dialect=csv.excel)

def write():
    # flush the buffered rows to the CSV file and empty the buffer
    writer.writerows(ljpool)
    del ljpool[0:len(ljpool)]

def getit():
    # keep at most 200 requests outstanding, pulling URLs off the query list
    while len(ljpoolsize) < 200 and len(ljquery) > 0:
        url = ljquery.pop()
        # print "do request", url
        http_client.do_request(url, lj_parser_recurse())
        ljpoolsize.append(url)

# seed the crawl with one FOAF document and run it to completion
http_client.do_request("http://shared_lj.livejournal.com/data/foaf", lj_parser_recurse())
asyncore.loop()

# then switch to the live update stream and fetch FOAF data
# for every journal that posts and has not been seen before
import atomstream
import feedparser
from itertools import imap

while 1:
    for update in imap(feedparser.parse, atomstream.connect()):
        login = re.findall("http://(.+)\.livejournal\.com", update.feed.link)
        if login and login[0] != "community" and ljuser.get(login[0], None) is None:
            ljuser[login[0]] = 1
            http_client.do_request("http://" + login[0] + ".livejournal.com/data/foaf", lj_parser_recurse())
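Neither write() nor getit() is called anywhere in this excerpt; presumably they are invoked from the FOAF consumer when a response finishes, to flush buffered rows and keep the request pool topped up. A hedged sketch of such a completion hook (the function name and the row contents are illustrative assumptions, not from the original):

# hypothetical hook, e.g. called at the end of lj_parser_recurse.close()
def on_foaf_done(row):
    ljpool.append(row)   # buffer the parsed row
    if len(ljpool) >= 50:
        write()          # flush buffered rows to the CSV file
    getit()              # refill the pool, up to 200 outstanding requests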
    # continuation of the FOAF parser's close(); nick, ljnick, name and img
    # are extracted from self.data earlier in the same method (not shown)
    date = re.findall("""<foaf:dateOfBirth>([^<]+)</foaf:dateOfBirth>""", self.data)
    posted = re.findall("""<ya:posted>([^<]+)</ya:posted>""", self.data)
    if nick:
        ljuser.append((nick and nick[0] or "",
                       ljnick and ljnick[0] or "",
                       name and name[0] or "",
                       img and img[0] or "",
                       date and date[0] or "",
                       posted and posted[0] or ""))
    # write out the collected rows in batches of 50
    if len(ljuser) >= 50:
        writer.writerows(ljuser)
        for i in range(0, len(ljuser)):
            ljuser.pop()
    return

URLS = ("http://blog.yandex.ru/top/lj/",)
filename = "ljlogin.txt"
writer = csv.writer(open(filename, "wb"), dialect=csv.excel)

for url in URLS:
    http_client.do_request(url, parser())

asyncore.loop()
# flush whatever is still buffered below the 50-row threshold
writer.writerows(ljuser)