def FromDBRow(self, row):
    """Populate this feed object from a `feeds` table row.

    When the row has NULL in `usr` or `channel`, lazily provisions a
    posting user / a non-postable channel named after the feed and
    writes the new ids back to the feeds table.  Relies on the
    module-level `cur` cursor and `db` connection.
    """
    # Straight column copies.
    self.freq = row['freq']
    self.url = urlnorm.normalize(str(row['url']))
    self.feedid = row['feedid']
    self.lasttime = row['lasttime']
    self.feedclass = row['feedclass']
    self.channel = row['channel']
    self.feedname = row['feedname']

    if row['usr'] is not None:
        self.usr = row['usr']
    else:
        # First sighting: create a robot user for this feed.
        cur.execute(
            "insert into usrs (name,password,email) values (%s,%s,%s) returning usrid;",
            [self.feedname, random.getrandbits(10), '*****@*****.**'])
        self.usr = cur.fetchone()['usrid']
        cur.execute("update feeds set usr = %s where feedid = %s",
                    [self.usr, self.feedid])
        db.commit()

    if row['channel'] is not None:
        self.channel = row['channel']
    else:
        # No channel yet: create one the public cannot post to.
        cur.execute(
            "insert into channels (name,postable) values (%s,%s) returning chanid",
            [self.feedname, False])
        self.channel = cur.fetchone()['chanid']
        cur.execute("update feeds set channel = %s where feedid = %s",
                    [self.channel, self.feedid])
        db.commit()
def FromDBRow(self, row):
    """Populate this feed object from a `feeds` table row.

    Backfills NULL columns (feedname, usr, channel) into the feeds
    table as it goes.  Relies on module-level `cur`/`db` handles.
    """
    self.freq = row['freq']
    self.url = urlnorm.normalize(str(row['url']))
    self.feedid = row['feedid']
    self.lasttime = row['lasttime']
    self.feedclass = row['feedclass']
    self.channel = row['channel']
    if row['feedname'] is None:
        if self.feedclass == 1:  # Reddit: carve the name out of the URL.
            tempname = row['url']
            tempname = tempname[tempname.find('eddit.com') - 1:tempname.find('.rss')]
            tempname = tempname[0:35]
            self.feedname = tempname
        else:
            # BUGFIX: previously parsed the hardcoded reddit front page and
            # read `.title` off the top-level parse result (which has no
            # title attribute).  Parse this feed's own URL and use its
            # channel title, falling back to the URL — matching the other
            # FromDBRow implementations in this codebase.
            try:
                self.feedname = feedparser.parse(self.url).feed.title[:35]
            except Exception:
                self.feedname = self.url
        cur.execute("update feeds set feedname = %s where feedid = %s",
                    [self.feedname, self.feedid])
    else:
        self.feedname = row['feedname']
    if row['usr'] is None:
        # First sighting: provision a posting user named after the feed.
        cur.execute(
            "insert into usrs (name,password,email) values (%s,%s,%s) returning usrid;",
            [self.feedname, random.getrandbits(10), '*****@*****.**'])
        self.usr = cur.fetchone()['usrid']
        cur.execute("update feeds set usr = %s where feedid = %s",
                    [self.usr, self.feedid])
        db.commit()
    else:
        self.usr = row['usr']
    if row['channel'] is None:
        # And a non-postable channel to drop stories into.
        cur.execute(
            "insert into channels (name,postable) values (%s,%s) returning chanid",
            [self.feedname, False])
        self.channel = cur.fetchone()['chanid']
        cur.execute("update feeds set channel = %s where feedid = %s",
                    [self.channel, self.feedid])
        db.commit()
    else:
        self.channel = row['channel']
def FromDBRow(self,row): db = psycopg2.connect("dbname='lonava' user='******' host='localhost' password='******'") cur = db.cursor(cursor_factory=psycopg2.extras.DictCursor) self.freq = row['freq'] self.url = urlnorm.normalize(str(row['url'])) self.feedid = row['feedid'] self.lasttime = row['lasttime'] self.feedclass = row['feedclass'] self.channel = row['channel'] if row['feedname'] is None: if self.feedclass == 1: tempname = row['url'] tempname = tempname[tempname.find('eddit.com') -1:tempname.find('.rss')] tempname = tempname[0:35] self.feedname = tempname else: try: self.feedname = feedparser.parse(self.url).feed.title[:35] except: self.feedname = self.url cur.execute("update feeds set feedname = %s where feedid = %s",[self.feedname,self.feedid]) else: self.feedname = row['feedname'] if row['usr'] is None: cur.execute("insert into usrs (name,password,email) values (%s,%s,%s) returning usrid;",[self.feedname,random.getrandbits(10),'*****@*****.**']) self.usr = cur.fetchone()['usrid'] cur.execute("update feeds set usr = %s where feedid = %s",[self.usr,self.feedid]) db.commit() else: self.usr = row['usr'] if row['channel'] is None: cur.execute("insert into channels (name,postable) values (%s,%s) returning chanid",[self.feedname,False]) self.channel = cur.fetchone()['chanid'] cur.execute("update feeds set channel = %s where feedid = %s",[self.channel,self.feedid]) db.commit() else: self.channel = row['channel'] if row['lastupdated'] is None: fe = feedparser.parse(self.url) if hasattr(fe, 'updated'): upd = datetime.datetime.fromtimestamp(mktime(fe.updated)) print "Fixed time" else: upd = datetime.datetime.now() - datetime.timedelta(days=365) if upd is not None: updated = upd cur.execute("update feeds set lastupdated = %s where feedid = %s",[updated,self.feedid]) self.lastupdated = updated print "using parsed value" else: self.lastupdated = datetime.datetime.now() - datetime.timedelta(days=365) else: lastupdated = row['lastupdated'] self.lastupdated = lastupdated 
print "Using DB value" db.commit()
# fetchPage: hand-rolled HTTP/1.1 GET over a raw socket.  Splits host/port/
# path out of `url`, rebuilds the client's original headers (NUL-NUL
# separated in `header`) while forcing its own Host: and Connection: close,
# then connects and sends the request.  The visible fragment ends right
# after the send; the receive/logging half of the function is not shown
# here, so the code is left byte-identical.
# NOTE(review): `errno = s.connect_ex((host, port))` — connect_ex returns 0
# on success and a POSITIVE errno on failure, so `if errno < 0` can never
# fire and connect failures go undetected; should be `if errno != 0`.  The
# local name also shadows the stdlib `errno` module.
# NOTE(review): `host = url.split("/")[2]` assumes the URL always carries a
# scheme ("http://...") — raises IndexError otherwise; TODO confirm callers.
def fetchPage(workerid, logTime, clientIP, url, partial_len, header): global num_success, num_failure, num_cancelled, respHdrLimit global log_dir, staging_dir, logfile_prefix, tmpfile_prefix # build a request startTS = time.time() url = urlnorm.normalize(url) #print "### processing:", url host = url.split("/")[2] port = 80 if len(host.split(":")) > 1: port = int(host.split(":")[1]) path = "" for item in url.split("/")[3:]: path += "/" + item #agent = "Mozilla/5.0" #print url request = "GET %s HTTP/1.1\r\nHost: %s\r\nConnection: close\r\n" % (path, host) orghdr = "" for hdrline in header.split("\0\0"): orghdr += hdrline + "\r\n" #if hdrline.startswith("x-"): # # skip codeen specific optional header # continue if hdrline.startswith("Connection:"): # we've already set this properly continue if hdrline.startswith("Host:"): # we've already set this properly continue request += hdrline + "\r\n" if not request.endswith("\r\n\r\n"): request += "\r\n" if not orghdr.endswith("\r\n\r\n"): orghdr += "\r\n" #print request # make a connection s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) errno = s.connect_ex((host, port)) if errno < 0: print "connect failed", os.strerror(errno) num_failure += 1 s.close() return # send the request nSend = 0 try: nSend = s.send(request) except socket.error, msg: print "send error", msg num_failure += 1 s.close() return
def isgd(url):
    """Shorten a URL with the is.gd API.

    Returns the shortened URL on success; raises ShortenError with the
    service's error code and message when is.gd rejects the request.
    (Docstring typo fixed: "PAI" -> "API".)
    """
    url = urlnorm.normalize(url.encode('utf-8'), assume_scheme='http')
    params = urllib.urlencode({'format': 'json', 'url': url})
    request = http.get_json("http://is.gd/create.php?%s" % params)
    if "errorcode" in request:
        raise ShortenError(request["errorcode"], request["errormessage"])
    else:
        return request["shorturl"]
def isgd(url):
    """Shorten a URL via the is.gd API and return the short URL.

    Raises ShortenError when the service reports an error.
    """
    normalized = urlnorm.normalize(url.encode('utf-8'), assume_scheme='http')
    query = urllib.urlencode({'format': 'json', 'url': normalized})
    response = http.get_json("http://is.gd/create.php?%s" % query)
    if "errorcode" not in response:
        return response["shorturl"]
    raise ShortenError(response["errorcode"], response["errormessage"])
def isgd(url): """ shortens a URL with the is.gd API """ url = urlnorm.normalize(url.encode('utf-8'), assume_scheme='http') req = requests.get("http://is.gd/create.php", params={ 'format': 'json', 'url': url }) try: json = req.json() except ValueError: print "[!] ERROR: is.gd returned broken json" raise if "errorcode" in json: raise ShortenError(json["errorcode"], json["errormessage"]) else: return json["shorturl"]
def FromDBRow(self, row):
    """Populate this feed object from a `feeds` table row.

    Backfills NULL columns (feedname, usr, channel) into the feeds
    table as it goes.  Relies on module-level `cur`/`db` handles.
    """
    self.freq = row['freq']
    self.url = urlnorm.normalize(str(row['url']))
    self.feedid = row['feedid']
    self.lasttime = row['lasttime']
    self.feedclass = row['feedclass']
    self.channel = row['channel']
    if row['feedname'] is None:
        if self.feedclass == 1:  # Reddit: carve the name from the URL.
            tempname = row['url']
            tempname = tempname[tempname.find('eddit.com') -
                                1:tempname.find('.rss')]
            tempname = tempname[0:35]
            self.feedname = tempname
        else:
            # BUGFIX: previously parsed the hardcoded reddit front page and
            # read `.title` off the top-level parse result (which has no
            # title attribute).  Parse this feed's own URL and use its
            # channel title, falling back to the URL — matching the other
            # FromDBRow implementations in this codebase.
            try:
                self.feedname = feedparser.parse(self.url).feed.title[:35]
            except Exception:
                self.feedname = self.url
        cur.execute("update feeds set feedname = %s where feedid = %s",
                    [self.feedname, self.feedid])
    else:
        self.feedname = row['feedname']
    if row['usr'] is None:
        # First sighting: provision a posting user named after the feed.
        cur.execute(
            "insert into usrs (name,password,email) values (%s,%s,%s) returning usrid;",
            [self.feedname, random.getrandbits(10), '*****@*****.**'])
        self.usr = cur.fetchone()['usrid']
        cur.execute("update feeds set usr = %s where feedid = %s",
                    [self.usr, self.feedid])
        db.commit()
    else:
        self.usr = row['usr']
    if row['channel'] is None:
        # And a non-postable channel to drop stories into.
        cur.execute(
            "insert into channels (name,postable) values (%s,%s) returning chanid",
            [self.feedname, False])
        self.channel = cur.fetchone()['chanid']
        cur.execute("update feeds set channel = %s where feedid = %s",
                    [self.channel, self.feedid])
        db.commit()
    else:
        self.channel = row['channel']
def FromDBRow(self, row):
    """Load feed attributes from a DB row.

    On a feed's first sighting (NULL `usr` / `channel` columns) this
    creates a robot user and a non-postable channel named after the
    feed and persists their ids back to the feeds table.  Uses the
    module-level `cur` cursor and `db` connection.
    """
    # Plain copies, then the normalized URL.
    for col in ('freq', 'feedid', 'lasttime', 'feedclass', 'channel',
                'feedname'):
        setattr(self, col, row[col])
    self.url = urlnorm.normalize(str(row['url']))

    self.usr = row['usr']
    if self.usr is None:
        # Provision the feed's posting user.
        cur.execute(
            "insert into usrs (name,password,email) values (%s,%s,%s) returning usrid;",
            [self.feedname, random.getrandbits(10), '*****@*****.**'])
        self.usr = cur.fetchone()['usrid']
        cur.execute("update feeds set usr = %s where feedid = %s",
                    [self.usr, self.feedid])
        db.commit()

    if row['channel'] is None:
        # Provision the feed's channel.
        cur.execute(
            "insert into channels (name,postable) values (%s,%s) returning chanid",
            [self.feedname, False])
        self.channel = cur.fetchone()['chanid']
        cur.execute("update feeds set channel = %s where feedid = %s",
                    [self.channel, self.feedid])
        db.commit()
    else:
        self.channel = row['channel']
# Fragment of the RSS polling loop: when a feed is due (now >= lasttime +
# freq) it parses the feed, skips news.ycombinator self-links, dedupes each
# entry by url / id_from_feed per channel, then looks up whether the story
# already has a commentgroup anywhere on the site.  Collapsed onto one line
# and truncated mid-statement (ends at a dangling `if`), so it is left
# byte-identical.
# NOTE(review): the local `id` shadows the builtin — rename once this file
# is restored to normal formatting.
print "Loaded." lastrun = feed.lasttime nextrun = lastrun + datetime.timedelta(0,feed.freq) print("Lastrun" + str(lastrun)) print("Nextrun" + str(nextrun)) print("Now is " + str( datetime.datetime.now())) if datetime.datetime.now() >= nextrun: print feed.feedname.encode('ascii','ignore') allentries = feedparser.parse(feed.url) for entry in allentries.entries: print entry.title.encode('ascii','ignore') title = entry.title cur.execute("select count(*) as count from stories where url = %s and location = %s",[str(entry.link),feed.channel]) count = cur.fetchone()['count'] if( (str(entry.link).find('news.ycombinator') < 1) and (count < 1)): # no (self) links, please: link = urlnorm.normalize(str(entry.link)) if hasattr(entry,'id') is True: id = entry.id print "Link has an ID" else: id = str(entry.link) print "Link has no ID" cur.execute("select count(*) as count from stories where (url = %s or id_from_feed = %s) and location = %s",[link,str(id),feed.channel]) count = cur.fetchone()['count'] if count < 1: #New Story, for this chan. #But is it new for all of Lonava? cur.execute("select commentgroup from stories where url = %s",[link]) existing = cur.fetchall(); if len(existing) > 0:
import urlnorm #Modified url verification lib import feedparser import datetime import time #feed = "http://feeds.feedburner.com/blogspot/MKuf" feed = "http://online.wsj.com/article/SB10001424052748703447004575449490162986822.html?mod=rss_Technology" url = urlnorm.normalize(str(feed)) print feed print url
# FromDBRow: populate this feed from a DB row, opening its own psycopg2
# connection and backfilling NULL feedname/usr/channel/lastupdated columns
# into the feeds table.  The trailing db.commit() of the sibling version of
# this function is not visible in this fragment, so the code is left
# byte-identical.
# NOTE(review): mktime(fe.updated) is wrong — feedparser's `updated` is the
# raw date STRING; the struct_time that time.mktime() needs is exposed as
# `fe.updated_parsed`.  As written, the hasattr branch raises TypeError
# whenever it is taken.
# NOTE(review): the bare `except:` around the title lookup swallows
# everything, including KeyboardInterrupt — narrow it when editing.
# NOTE(review): `upd` is assigned on both branches, so `if upd is not None`
# is always true and its else-arm is dead code.
def FromDBRow(self, row): db = psycopg2.connect( "dbname='lonava' user='******' host='localhost' password='******'" ) cur = db.cursor(cursor_factory=psycopg2.extras.DictCursor) self.freq = row['freq'] self.url = urlnorm.normalize(str(row['url'])) self.feedid = row['feedid'] self.lasttime = row['lasttime'] self.feedclass = row['feedclass'] self.channel = row['channel'] if row['feedname'] is None: if self.feedclass == 1: tempname = row['url'] tempname = tempname[tempname.find('eddit.com') - 1:tempname.find('.rss')] tempname = tempname[0:35] self.feedname = tempname else: try: self.feedname = feedparser.parse(self.url).feed.title[:35] except: self.feedname = self.url cur.execute("update feeds set feedname = %s where feedid = %s", [self.feedname, self.feedid]) else: self.feedname = row['feedname'] if row['usr'] is None: cur.execute( "insert into usrs (name,password,email) values (%s,%s,%s) returning usrid;", [self.feedname, random.getrandbits(10), '*****@*****.**']) self.usr = cur.fetchone()['usrid'] cur.execute("update feeds set usr = %s where feedid = %s", [self.usr, self.feedid]) db.commit() else: self.usr = row['usr'] if row['channel'] is None: cur.execute( "insert into channels (name,postable) values (%s,%s) returning chanid", [self.feedname, False]) self.channel = cur.fetchone()['chanid'] cur.execute("update feeds set channel = %s where feedid = %s", [self.channel, self.feedid]) db.commit() else: self.channel = row['channel'] if row['lastupdated'] is None: fe = feedparser.parse(self.url) if hasattr(fe, 'updated'): upd = datetime.datetime.fromtimestamp(mktime(fe.updated)) print "Fixed time" else: upd = datetime.datetime.now() - datetime.timedelta(days=365) if upd is not None: updated = upd cur.execute( "update feeds set lastupdated = %s where feedid = %s", [updated, self.feedid]) self.lastupdated = updated print "using parsed value" else: self.lastupdated = datetime.datetime.now( ) - datetime.timedelta(days=365) else: lastupdated = row['lastupdated'] 
self.lastupdated = lastupdated print "Using DB value"
# Fragment of the posting loop: loads the feed from its row, rate-limits
# itself with a 4-second politeness window, parses the feed, extracts each
# entry's first outward link, dedupes it per channel, reuses or looks up a
# commentgroup, and inserts the story attributed to the feed's robot user.
# Collapsed onto one line and truncated mid-statement (ends at a dangling
# `if commentgroupid == 0:`), so it is left byte-identical.
feed.FromDBRow(row) while datetime.datetime.now() < waituntil: print "Sleeping to be polite.." + str(datetime.datetime.now()) time.sleep(.5) waituntil = datetime.datetime.now() + datetime.timedelta(0,4) print feed.feedname.encode('ascii','ignore') allentries = feedparser.parse(feed.url) for entry in allentries.entries: print entry.title.encode('ascii','ignore') title = entry.title outwardlinks = extractExtLinks(entry.summary) if len(outwardlinks) > 0: # no (self) links, please print str(outwardlinks[0]) link = urlnorm.normalize(str(outwardlinks[0])) # There should only be one. If there is more, take the first. cur.execute("select count(*) as count from stories where url = %s and location = %s",[link,feed.channel]) count = cur.fetchone()['count'] if count < 1: #New Story, for this chan. #But is it new for all of Lonava? cur.execute("select commentgroup from stories where url = %s",[link]) existing = cur.fetchall(); if len(existing) > 0: commentgroupid = existing[0]['commentgroup'] else: commentgroupid = 0; cur.execute("insert into stories (usr,title,url,text,name,location,channame) values (%s,%s,%s,%s,%s,%s,(select name from channels where chanid = %s )) returning storyid;",[feed.usr,entry.title,link,'Via: ' + entry.link, feed.feedname, feed.channel,feed.channel]) storyid = cur.fetchone()['storyid'] if commentgroupid == 0:
# Auto-formatted twin of the posting-loop fragment above: politeness sleep,
# feed parse, outward-link extraction, per-channel dedupe, commentgroup
# lookup.  Truncated mid-statement (ends at a dangling `else:`), so it is
# left byte-identical.
while datetime.datetime.now() < waituntil: print "Sleeping to be polite.." + str(datetime.datetime.now()) time.sleep(.5) waituntil = datetime.datetime.now() + datetime.timedelta(0, 4) print feed.feedname.encode('ascii', 'ignore') allentries = feedparser.parse(feed.url) for entry in allentries.entries: print entry.title.encode('ascii', 'ignore') title = entry.title outwardlinks = extractExtLinks(entry.summary) if len(outwardlinks) > 0: # no (self) links, please print str(outwardlinks[0]) link = urlnorm.normalize( str(outwardlinks[0]) ) # There should only be one. If there is more, take the first. cur.execute( "select count(*) as count from stories where url = %s and location = %s", [link, feed.channel]) count = cur.fetchone()['count'] if count < 1: #New Story, for this chan. #But is it new for all of Lonava? cur.execute( "select commentgroup from stories where url = %s", [link]) existing = cur.fetchall() if len(existing) > 0: commentgroupid = existing[0]['commentgroup'] else:
# Fragment of the Digg-API polling loop: when the feed is due it fetches a
# JSON story list via urllib2, walks `stories`, normalizes each story link,
# dedupes per channel, reuses or creates a commentgroup, and inserts the
# story.  Depends on `lastrun`/`url` defined before this fragment and is
# truncated after the insert, so it is left byte-identical.
nextrun = lastrun + datetime.timedelta(0,feed.freq) print("Lastrun" + str(lastrun)) print("Nextrun" + str(nextrun)) print("Now is " + str( datetime.datetime.now())) if datetime.datetime.now() >= nextrun: website = urllib2.urlopen(url) website_html = json.loads(website.read()) website.close() a = 0 while a < len(website_html[u'stories']): print str(a) + "/ " + str(len(website_html[u'stories'])) digglink = website_html[u'stories'][a][u'href'] title = website_html[u'stories'][a][u'title'] link = urlnorm.normalize(str(website_html[u'stories'][a][u'link'])) print str(link) cur.execute("select count(*) as count from stories where url = %s and location = %s",[link,feed.channel]) count = cur.fetchone()['count'] if count < 1: #New Story, for this chan. #But is it new for all of Lonava? cur.execute("select commentgroup from stories where url = %s",[link]) existing = cur.fetchall(); if len(existing) > 0: commentgroupid = existing[0]['commentgroup'] else: commentgroupid = 0; cur.execute("insert into stories (usr,title,url,text,name,location,channame) values (%s,%s,%s,%s,%s,%s,(select name from channels where chanid = %s )) returning storyid;",[feed.usr,title,link,'Via: ' + digglink, feed.feedname, feed.channel,feed.channel])
# Auto-formatted twin of the RSS polling-loop fragment: due-time check,
# feed parse, news.ycombinator self-link filter, url/id_from_feed dedupe.
# Truncated mid-call (ends at a dangling `cur.execute(`), so it is left
# byte-identical.
# NOTE(review): the local `id` shadows the builtin — rename once this file
# is restored to normal formatting.
print("Lastrun" + str(lastrun)) print("Nextrun" + str(nextrun)) print("Now is " + str(datetime.datetime.now())) if datetime.datetime.now() >= nextrun: print feed.feedname.encode('ascii', 'ignore') allentries = feedparser.parse(feed.url) for entry in allentries.entries: print entry.title.encode('ascii', 'ignore') title = entry.title cur.execute( "select count(*) as count from stories where url = %s and location = %s", [str(entry.link), feed.channel]) count = cur.fetchone()['count'] if ((str(entry.link).find('news.ycombinator') < 1) and (count < 1)): # no (self) links, please: link = urlnorm.normalize(str(entry.link)) if hasattr(entry, 'id') is True: id = entry.id print "Link has an ID" else: id = str(entry.link) print "Link has no ID" cur.execute( "select count(*) as count from stories where (url = %s or id_from_feed = %s) and location = %s", [link, str(id), feed.channel]) count = cur.fetchone()['count'] if count < 1: #New Story, for this chan. #But is it new for all of Lonava? cur.execute(
# Auto-formatted twin of the Digg-API polling fragment: fetch JSON story
# list, normalize each link, per-channel dedupe, commentgroup lookup.
# Depends on `lastrun`/`url` defined before this fragment and is truncated
# mid-statement (ends at a dangling `else:`), so it is left byte-identical.
nextrun = lastrun + datetime.timedelta(0, feed.freq) print("Lastrun" + str(lastrun)) print("Nextrun" + str(nextrun)) print("Now is " + str(datetime.datetime.now())) if datetime.datetime.now() >= nextrun: website = urllib2.urlopen(url) website_html = json.loads(website.read()) website.close() a = 0 while a < len(website_html[u'stories']): print str(a) + "/ " + str(len(website_html[u'stories'])) digglink = website_html[u'stories'][a][u'href'] title = website_html[u'stories'][a][u'title'] link = urlnorm.normalize(str(website_html[u'stories'][a][u'link'])) print str(link) cur.execute( "select count(*) as count from stories where url = %s and location = %s", [link, feed.channel]) count = cur.fetchone()['count'] if count < 1: #New Story, for this chan. #But is it new for all of Lonava? cur.execute("select commentgroup from stories where url = %s", [link]) existing = cur.fetchall() if len(existing) > 0: commentgroupid = existing[0]['commentgroup'] else: