Example #1
0
    def FromDBRow(self, row):
        """Populate this feed object from a database row.

        When the row lacks a usr or channel, a placeholder account /
        non-postable channel is created, linked back to the feeds table,
        and committed.
        """
        self.freq = row['freq']
        self.url = urlnorm.normalize(str(row['url']))
        self.feedid = row['feedid']
        self.lasttime = row['lasttime']
        self.feedclass = row['feedclass']
        self.channel = row['channel']
        self.feedname = row['feedname']

        # Owner account: take the stored one, or mint a placeholder.
        if row['usr'] is not None:
            self.usr = row['usr']
        else:
            cur.execute(
                "insert into usrs (name,password,email) values (%s,%s,%s) returning usrid;",
                [self.feedname, random.getrandbits(10), '*****@*****.**'])
            self.usr = cur.fetchone()['usrid']
            cur.execute("update feeds set usr = %s where feedid = %s",
                        [self.usr, self.feedid])
            db.commit()

        # Channel: take the stored one, or create a non-postable channel.
        if row['channel'] is not None:
            self.channel = row['channel']
        else:
            cur.execute(
                "insert into channels (name,postable) values (%s,%s) returning chanid",
                [self.feedname, False])
            self.channel = cur.fetchone()['chanid']
            cur.execute("update feeds set channel = %s where feedid = %s",
                        [self.channel, self.feedid])
            db.commit()
Example #2
0
 def FromDBRow(self,row):
     """Populate this feed object from a database row.

     Missing feedname/usr/channel columns are backfilled: the name is
     derived from the URL (or a reddit fetch), and placeholder usr and
     channel rows are inserted and linked back to the feeds table.
     """
     self.freq = row['freq']
     self.url = urlnorm.normalize(str(row['url']))
     self.feedid = row['feedid']
     self.lasttime = row['lasttime']
     self.feedclass = row['feedclass']
     self.channel = row['channel']
     if row['feedname'] is None:
         if self.feedclass == 1: #Reddit
             # Derive a name from the URL slice between "eddit.com" and
             # ".rss", capped at 35 characters.
             tempname = row['url']
             tempname = tempname[tempname.find('eddit.com') -1:tempname.find('.rss')]
             tempname = tempname[0:35]
             self.feedname = tempname
         else:
             self.feedname = feedparser.parse('http://reddit.com/.rss').title[:20]
         cur.execute("update feeds set feedname = %s where feedid = %s",[self.feedname,self.feedid])
     else:
         self.feedname = row['feedname']
     if row['usr'] is None:
         # Create a placeholder account that owns this feed's posts.
         cur.execute("insert into usrs (name,password,email) values (%s,%s,%s) returning usrid;",[self.feedname,random.getrandbits(10),'*****@*****.**'])
         self.usr = cur.fetchone()['usrid']
         cur.execute("update feeds set usr = %s where feedid = %s",[self.usr,self.feedid])
         db.commit()
     else:
         self.usr = row['usr']
     # BUG FIX: this block was nested inside the usr else-branch, so a feed
     # that needed a new usr never had its channel initialized (the sibling
     # versions of this method elsewhere in the file keep it at this level).
     if row['channel'] is None:
         cur.execute("insert into channels (name,postable) values (%s,%s) returning chanid",[self.feedname,False])
         self.channel = cur.fetchone()['chanid']
         cur.execute("update feeds set channel = %s where feedid = %s",[self.channel,self.feedid])
         db.commit()
     else:
         self.channel = row['channel']
Example #3
0
    def FromDBRow(self,row):
        """Populate this feed object from a DB row, backfilling missing
        feedname/usr/channel/lastupdated columns as a side effect."""
        db = psycopg2.connect("dbname='lonava' user='******' host='localhost' password='******'")
        cur = db.cursor(cursor_factory=psycopg2.extras.DictCursor)

        self.freq = row['freq']
        self.url = urlnorm.normalize(str(row['url']))
        self.feedid = row['feedid']
        self.lasttime = row['lasttime']
        self.feedclass = row['feedclass']
        self.channel = row['channel']
        if row['feedname'] is None:
            if self.feedclass == 1:
                # Reddit feed: name is the URL slice between "eddit.com"
                # and ".rss", capped at 35 characters.
                tempname = row['url']
                tempname = tempname[tempname.find('eddit.com') -1:tempname.find('.rss')]
                tempname = tempname[0:35]   
                self.feedname = tempname
            else:
                try:
                    self.feedname = feedparser.parse(self.url).feed.title[:35]
                except:
                    # Fall back to the raw URL when the feed has no title.
                    self.feedname = self.url
                # NOTE(review): this update runs only in the else branch, so a
                # reddit-derived name is never written back — confirm intended.
                cur.execute("update feeds set feedname = %s where feedid = %s",[self.feedname,self.feedid])
        else:
            self.feedname = row['feedname']

        if row['usr'] is None:
            # Create a placeholder account that owns this feed's stories.
            cur.execute("insert into usrs (name,password,email) values (%s,%s,%s) returning usrid;",[self.feedname,random.getrandbits(10),'*****@*****.**'])
            self.usr = cur.fetchone()['usrid']
            cur.execute("update feeds set usr = %s where feedid = %s",[self.usr,self.feedid])
            db.commit()
        else:
            self.usr = row['usr']
        if row['channel'] is None:
            # Create a non-postable channel named after the feed.
            cur.execute("insert into channels (name,postable) values (%s,%s) returning chanid",[self.feedname,False])
            self.channel = cur.fetchone()['chanid']
            cur.execute("update feeds set channel = %s where feedid = %s",[self.channel,self.feedid])
            db.commit()
        else:
            self.channel = row['channel']       
        if row['lastupdated'] is None:
            fe = feedparser.parse(self.url)
            if hasattr(fe, 'updated'):
                # NOTE(review): feedparser's .updated is a string, while
                # mktime() expects a struct_time (.updated_parsed) — confirm
                # this path does not raise at runtime.
                upd = datetime.datetime.fromtimestamp(mktime(fe.updated))
                print "Fixed time"
            else: 
                # No timestamp in the feed: pretend it is a year old.
                upd = datetime.datetime.now() - datetime.timedelta(days=365)
            # NOTE(review): upd is assigned on both branches above, so the
            # else-arm below is dead code.
            if upd is not None:
                updated = upd
                cur.execute("update feeds set lastupdated = %s where feedid = %s",[updated,self.feedid])
                self.lastupdated = updated
                print "using parsed value"
            else:
                self.lastupdated = datetime.datetime.now() - datetime.timedelta(days=365)
        else:
            lastupdated = row['lastupdated']
            self.lastupdated = lastupdated 
            print "Using DB value"

        db.commit()
Example #4
0
def fetchPage(workerid, logTime, clientIP, url, partial_len, header):
    """Fetch *url* over a raw socket, replaying the client's original
    request headers (Host/Connection are set explicitly instead).

    Counts failures in the num_failure global; truncated here — the
    response-handling half of the function is not visible in this chunk.
    """
    global num_success, num_failure, num_cancelled, respHdrLimit
    global log_dir, staging_dir, logfile_prefix, tmpfile_prefix

    # build a request
    startTS = time.time()
    url = urlnorm.normalize(url)
    #print "### processing:", url
    # Split scheme://host:port/path by hand; default to port 80.
    host = url.split("/")[2]
    port = 80
    if len(host.split(":")) > 1:
        port = int(host.split(":")[1])
    path = ""
    for item in url.split("/")[3:]:
        path += "/" + item
    #agent = "Mozilla/5.0"
    #print url
    request = "GET %s HTTP/1.1\r\nHost: %s\r\nConnection: close\r\n" % (path,
                                                                        host)
    # orghdr keeps a verbatim copy of the client's headers; request gets
    # everything except Host/Connection, which were set above.
    orghdr = ""
    for hdrline in header.split("\0\0"):
        orghdr += hdrline + "\r\n"
        #if hdrline.startswith("x-"):
        #	# skip codeen specific optional header
        #	continue
        if hdrline.startswith("Connection:"):
            # we've already set this properly
            continue
        if hdrline.startswith("Host:"):
            # we've already set this properly
            continue
        request += hdrline + "\r\n"
    if not request.endswith("\r\n\r\n"):
        request += "\r\n"
    if not orghdr.endswith("\r\n\r\n"):
        orghdr += "\r\n"
    #print request

    # make a connection
    s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
    # NOTE(review): connect_ex returns 0 on success and a positive errno on
    # failure, so this `< 0` test never fires — likely should be `!= 0`.
    # The local name also shadows the stdlib errno module.
    errno = s.connect_ex((host, port))
    if errno < 0:
        print "connect failed", os.strerror(errno)
        num_failure += 1
        s.close()
        return

    # send the request
    nSend = 0
    try:
        nSend = s.send(request)
    except socket.error, msg:
        print "send error", msg
        num_failure += 1
        s.close()
        return
Example #5
0
def fetchPage(workerid, logTime, clientIP, url, partial_len, header):
	"""Fetch *url* over a raw socket, replaying the client's headers
	(Host/Connection set explicitly). Truncated in this chunk."""
	global num_success, num_failure, num_cancelled, respHdrLimit
	global log_dir, staging_dir, logfile_prefix, tmpfile_prefix

	# build a request
	startTS = time.time()
	url = urlnorm.normalize(url)
	#print "### processing:", url
	# Split scheme://host:port/path by hand; default to port 80.
	host = url.split("/")[2]
	port = 80
	if len(host.split(":")) > 1:
		port = int(host.split(":")[1])
	path = ""
	for item in url.split("/")[3:]:
		path += "/" + item
	#agent = "Mozilla/5.0"
	#print url
	request = "GET %s HTTP/1.1\r\nHost: %s\r\nConnection: close\r\n" % (path, host)
	# orghdr keeps the client's headers verbatim; request drops
	# Host/Connection, which were set above.
	orghdr = ""
	for hdrline in header.split("\0\0"):
		orghdr += hdrline + "\r\n"
		#if hdrline.startswith("x-"):
		#	# skip codeen specific optional header
		#	continue
		if hdrline.startswith("Connection:"):
			# we've already set this properly
			continue
		if hdrline.startswith("Host:"):
			# we've already set this properly
			continue
		request += hdrline + "\r\n"
	if not request.endswith("\r\n\r\n"):
		request += "\r\n"
	if not orghdr.endswith("\r\n\r\n"):
		orghdr += "\r\n"
	#print request

	# make a connection
	s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
	s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
	# NOTE(review): connect_ex returns 0 on success / positive errno on
	# failure, so `< 0` never fires — likely should be `!= 0`.
	errno = s.connect_ex((host, port))
	if errno < 0:
		print "connect failed", os.strerror(errno)
		num_failure += 1
		s.close()
		return

	# send the request
	nSend = 0
	try:
		nSend = s.send(request)
	except socket.error, msg:
		print "send error", msg
		num_failure += 1
		s.close()
		return
Example #6
0
def isgd(url):
    """ shortens a URL with the is.gd API """
    # Normalize first so equivalent URLs shorten to the same link.
    url = urlnorm.normalize(url.encode('utf-8'), assume_scheme='http')
    params = urllib.urlencode({'format': 'json', 'url': url})
    request = http.get_json("http://is.gd/create.php?%s" % params)

    # is.gd reports failure via errorcode/errormessage keys in the JSON.
    if "errorcode" in request:
        raise ShortenError(request["errorcode"], request["errormessage"])
    else:
        return request["shorturl"]
Example #7
0
def isgd(url):
    """Shorten *url* via the is.gd API and return the short link.

    Raises ShortenError when is.gd reports an error code.
    """
    normalized = urlnorm.normalize(url.encode('utf-8'), assume_scheme='http')
    query = urllib.urlencode({'format': 'json', 'url': normalized})
    response = http.get_json("http://is.gd/create.php?%s" % query)

    if "errorcode" not in response:
        return response["shorturl"]
    raise ShortenError(response["errorcode"], response["errormessage"])
Example #8
0
def isgd(url):
    """ shortens a URL with the is.gd API """
    # Normalize first so equivalent URLs shorten to the same link.
    url = urlnorm.normalize(url.encode('utf-8'), assume_scheme='http')
    req = requests.get("http://is.gd/create.php",
                       params={
                           'format': 'json',
                           'url': url
                       })

    # is.gd occasionally returns malformed JSON; surface that loudly.
    try:
        json = req.json()
    except ValueError:
        print "[!] ERROR: is.gd returned broken json"
        raise

    # Failure is reported via errorcode/errormessage keys in the JSON.
    if "errorcode" in json:
        raise ShortenError(json["errorcode"], json["errormessage"])
    else:
        return json["shorturl"]
Example #9
0
 def FromDBRow(self, row):
     """Populate this feed object from a database row.

     Missing feedname/usr/channel columns are backfilled with generated
     values, which are also written back to the feeds table.
     """
     self.freq = row['freq']
     self.url = urlnorm.normalize(str(row['url']))
     self.feedid = row['feedid']
     self.lasttime = row['lasttime']
     self.feedclass = row['feedclass']
     self.channel = row['channel']
     if row['feedname'] is None:
         if self.feedclass == 1:  #Reddit
             # Name is the URL slice between "eddit.com" and ".rss",
             # capped at 35 characters.
             tempname = row['url']
             tempname = tempname[tempname.find('eddit.com') -
                                 1:tempname.find('.rss')]
             tempname = tempname[0:35]
             self.feedname = tempname
         else:
             self.feedname = feedparser.parse(
                 'http://reddit.com/.rss').title[:20]
         cur.execute("update feeds set feedname = %s where feedid = %s",
                     [self.feedname, self.feedid])
     else:
         self.feedname = row['feedname']
     if row['usr'] is None:
         # Create a placeholder account that will own this feed's posts.
         cur.execute(
             "insert into usrs (name,password,email) values (%s,%s,%s) returning usrid;",
             [self.feedname,
              random.getrandbits(10), '*****@*****.**'])
         self.usr = cur.fetchone()['usrid']
         cur.execute("update feeds set usr = %s where feedid = %s",
                     [self.usr, self.feedid])
         db.commit()
     else:
         self.usr = row['usr']
     # BUG FIX: the channel check was nested inside the usr else-branch, so
     # feeds that needed a new usr never got a channel assigned (the sibling
     # versions of this method elsewhere in the file keep it at this level).
     if row['channel'] is None:
         cur.execute(
             "insert into channels (name,postable) values (%s,%s) returning chanid",
             [self.feedname, False])
         self.channel = cur.fetchone()['chanid']
         cur.execute("update feeds set channel = %s where feedid = %s",
                     [self.channel, self.feedid])
         db.commit()
     else:
         self.channel = row['channel']
Example #10
0
    def FromDBRow(self,row):
        """Load this feed's fields from a database row.

        A placeholder usr / non-postable channel is inserted and linked
        back to the feeds table whenever the row lacks one.
        """
        self.freq = row['freq']
        self.url = urlnorm.normalize(str(row['url']))
        self.feedid = row['feedid']
        self.lasttime = row['lasttime']
        self.feedclass = row['feedclass']
        self.channel = row['channel']
        self.feedname = row['feedname']

        # Owner account: reuse the stored one, or mint a placeholder.
        existing_usr = row['usr']
        if existing_usr is not None:
            self.usr = existing_usr
        else:
            cur.execute("insert into usrs (name,password,email) values (%s,%s,%s) returning usrid;",[self.feedname,random.getrandbits(10),'*****@*****.**'])
            self.usr = cur.fetchone()['usrid']
            cur.execute("update feeds set usr = %s where feedid = %s",[self.usr,self.feedid])
            db.commit()

        # Channel: reuse the stored one, or create a non-postable channel.
        existing_chan = row['channel']
        if existing_chan is not None:
            self.channel = existing_chan
        else:
            cur.execute("insert into channels (name,postable) values (%s,%s) returning chanid",[self.feedname,False])
            self.channel = cur.fetchone()['chanid']
            cur.execute("update feeds set channel = %s where feedid = %s",[self.channel,self.feedid])
            db.commit()
Example #11
0
    print "Loaded."
    # Run this feed only once its next scheduled time has passed.
    lastrun = feed.lasttime
    nextrun = lastrun + datetime.timedelta(0,feed.freq)
    print("Lastrun" + str(lastrun))
    print("Nextrun" + str(nextrun))
    print("Now is " + str( datetime.datetime.now()))
    if datetime.datetime.now() >= nextrun:
        print feed.feedname.encode('ascii','ignore')
        allentries = feedparser.parse(feed.url)
        for entry in allentries.entries:
            print entry.title.encode('ascii','ignore')
            title = entry.title
            # Skip entries already stored for this channel.
            cur.execute("select count(*) as count from stories where url = %s and location = %s",[str(entry.link),feed.channel])
            count = cur.fetchone()['count']
            if( (str(entry.link).find('news.ycombinator') < 1) and (count < 1)): # no (self) links, please:
                link = urlnorm.normalize(str(entry.link))
                # Prefer the feed-supplied GUID; fall back to the link URL.
                if hasattr(entry,'id') is True:
                    id = entry.id
                    print "Link has an ID"
                else:
                    id = str(entry.link)
                    print "Link has no ID"

                # Re-check dedup against both the normalized URL and the GUID.
                cur.execute("select count(*) as count from stories where (url = %s or id_from_feed = %s) and location = %s",[link,str(id),feed.channel])
                count = cur.fetchone()['count']
                if count < 1:
                    #New Story, for this chan.
                    #But is it new for all of Lonava?
                    cur.execute("select commentgroup from stories where url = %s",[link])
                    existing = cur.fetchall();
                    if len(existing) > 0:
Example #12
0
import urlnorm    #Modified url verification lib
import feedparser
import datetime
import time

# Quick manual check of urlnorm.normalize() against a sample article URL.
#feed = "http://feeds.feedburner.com/blogspot/MKuf"
feed = "http://online.wsj.com/article/SB10001424052748703447004575449490162986822.html?mod=rss_Technology"
url = urlnorm.normalize(str(feed))
# Print original and normalized forms for eyeball comparison.
print feed
print url
Example #13
0
    def FromDBRow(self, row):
        """Populate this feed object from a DB row, backfilling missing
        feedname/usr/channel/lastupdated columns as a side effect."""
        db = psycopg2.connect(
            "dbname='lonava' user='******' host='localhost' password='******'"
        )
        cur = db.cursor(cursor_factory=psycopg2.extras.DictCursor)

        self.freq = row['freq']
        self.url = urlnorm.normalize(str(row['url']))
        self.feedid = row['feedid']
        self.lasttime = row['lasttime']
        self.feedclass = row['feedclass']
        self.channel = row['channel']
        if row['feedname'] is None:
            if self.feedclass == 1:
                # Reddit feed: name is the URL slice between "eddit.com"
                # and ".rss", capped at 35 characters.
                tempname = row['url']
                tempname = tempname[tempname.find('eddit.com') -
                                    1:tempname.find('.rss')]
                tempname = tempname[0:35]
                self.feedname = tempname
            else:
                try:
                    self.feedname = feedparser.parse(self.url).feed.title[:35]
                except:
                    # Fall back to the raw URL when the feed has no title.
                    self.feedname = self.url
                # NOTE(review): this update runs only in the else branch, so a
                # reddit-derived name is never written back — confirm intended.
                cur.execute("update feeds set feedname = %s where feedid = %s",
                            [self.feedname, self.feedid])
        else:
            self.feedname = row['feedname']

        if row['usr'] is None:
            # Create a placeholder account that owns this feed's stories.
            cur.execute(
                "insert into usrs (name,password,email) values (%s,%s,%s) returning usrid;",
                [self.feedname,
                 random.getrandbits(10), '*****@*****.**'])
            self.usr = cur.fetchone()['usrid']
            cur.execute("update feeds set usr = %s where feedid = %s",
                        [self.usr, self.feedid])
            db.commit()
        else:
            self.usr = row['usr']
        if row['channel'] is None:
            # Create a non-postable channel named after the feed.
            cur.execute(
                "insert into channels (name,postable) values (%s,%s) returning chanid",
                [self.feedname, False])
            self.channel = cur.fetchone()['chanid']
            cur.execute("update feeds set channel = %s where feedid = %s",
                        [self.channel, self.feedid])
            db.commit()
        else:
            self.channel = row['channel']
        if row['lastupdated'] is None:
            fe = feedparser.parse(self.url)
            if hasattr(fe, 'updated'):
                # NOTE(review): feedparser's .updated is a string, while
                # mktime() expects a struct_time (.updated_parsed) — confirm
                # this path does not raise at runtime.
                upd = datetime.datetime.fromtimestamp(mktime(fe.updated))
                print "Fixed time"
            else:
                # No timestamp in the feed: pretend it is a year old.
                upd = datetime.datetime.now() - datetime.timedelta(days=365)
            # NOTE(review): upd is assigned on both branches above, so the
            # else-arm below is dead code.
            if upd is not None:
                updated = upd
                cur.execute(
                    "update feeds set lastupdated = %s where feedid = %s",
                    [updated, self.feedid])
                self.lastupdated = updated
                print "using parsed value"
            else:
                self.lastupdated = datetime.datetime.now(
                ) - datetime.timedelta(days=365)
        else:
            lastupdated = row['lastupdated']
            self.lastupdated = lastupdated
            print "Using DB value"
Example #14
0
        feed.FromDBRow(row)
        # Politeness rate limit: wait out the window set by the last fetch.
        while datetime.datetime.now() <  waituntil:
                print "Sleeping to be polite.." + str(datetime.datetime.now())
                time.sleep(.5)
        waituntil = datetime.datetime.now() + datetime.timedelta(0,4)   
        print feed.feedname.encode('ascii','ignore')

        allentries = feedparser.parse(feed.url)
        for entry in allentries.entries:
            print entry.title.encode('ascii','ignore')
            title = entry.title
            outwardlinks = extractExtLinks(entry.summary)
            if len(outwardlinks) > 0:
                # no (self) links, please
                print str(outwardlinks[0])
                link = urlnorm.normalize(str(outwardlinks[0]))  # There should only be one. If there is more, take the first.
                # Skip links already stored for this channel.
                cur.execute("select count(*) as count from stories where url = %s and location = %s",[link,feed.channel])
                count = cur.fetchone()['count']
                if count < 1: 
                    #New Story, for this chan.
                    #But is it new for all of Lonava?
                    cur.execute("select commentgroup from stories where url = %s",[link])
                    existing = cur.fetchall();
                    if len(existing) > 0:
                        commentgroupid = existing[0]['commentgroup']
                    else:
                        commentgroupid = 0;
                        
                    cur.execute("insert into stories (usr,title,url,text,name,location,channame) values (%s,%s,%s,%s,%s,%s,(select name from channels where chanid = %s )) returning storyid;",[feed.usr,entry.title,link,'Via: ' + entry.link, feed.feedname, feed.channel,feed.channel])
                    storyid = cur.fetchone()['storyid']
                    if commentgroupid == 0:
Example #15
0
        while datetime.datetime.now() < waituntil:
            # Politeness rate limit: wait out the window set last iteration.
            print "Sleeping to be polite.." + str(datetime.datetime.now())
            time.sleep(.5)
        waituntil = datetime.datetime.now() + datetime.timedelta(0, 4)
        print feed.feedname.encode('ascii', 'ignore')

        allentries = feedparser.parse(feed.url)
        for entry in allentries.entries:
            print entry.title.encode('ascii', 'ignore')
            title = entry.title
            outwardlinks = extractExtLinks(entry.summary)
            if len(outwardlinks) > 0:
                # no (self) links, please
                print str(outwardlinks[0])
                link = urlnorm.normalize(
                    str(outwardlinks[0])
                )  # There should only be one. If there is more, take the first.
                # Skip links already stored for this channel.
                cur.execute(
                    "select count(*) as count from stories where url = %s and location = %s",
                    [link, feed.channel])
                count = cur.fetchone()['count']
                if count < 1:
                    #New Story, for this chan.
                    #But is it new for all of Lonava?
                    cur.execute(
                        "select commentgroup from stories where url = %s",
                        [link])
                    existing = cur.fetchall()
                    if len(existing) > 0:
                        commentgroupid = existing[0]['commentgroup']
                    else:
0
    # Run this feed only once its next scheduled time has passed.
    nextrun = lastrun + datetime.timedelta(0,feed.freq)
    print("Lastrun" + str(lastrun))
    print("Nextrun" + str(nextrun))
    print("Now is " + str( datetime.datetime.now()))
    if datetime.datetime.now() >= nextrun:
        # Fetch the Digg-style JSON story list.
        website = urllib2.urlopen(url)
        website_html = json.loads(website.read())
        website.close()

        a = 0

        while a < len(website_html[u'stories']):
            print str(a) + "/ " + str(len(website_html[u'stories']))
            digglink = website_html[u'stories'][a][u'href']
            title = website_html[u'stories'][a][u'title']
            link =  urlnorm.normalize(str(website_html[u'stories'][a][u'link']))
            print str(link)

            # Skip links already stored for this channel.
            cur.execute("select count(*) as count from stories where url = %s and location = %s",[link,feed.channel])
            count = cur.fetchone()['count']
            if count < 1:
                #New Story, for this chan.
                #But is it new for all of Lonava?
                cur.execute("select commentgroup from stories where url = %s",[link])
                existing = cur.fetchall();
                if len(existing) > 0:
                    commentgroupid = existing[0]['commentgroup']
                else:
                    commentgroupid = 0;
            
                cur.execute("insert into stories (usr,title,url,text,name,location,channame) values (%s,%s,%s,%s,%s,%s,(select name from channels where chanid = %s )) returning storyid;",[feed.usr,title,link,'Via: ' + digglink, feed.feedname, feed.channel,feed.channel])
0
    print("Lastrun" + str(lastrun))
    print("Nextrun" + str(nextrun))
    print("Now is " + str(datetime.datetime.now()))
    # Run this feed only once its next scheduled time has passed.
    if datetime.datetime.now() >= nextrun:
        print feed.feedname.encode('ascii', 'ignore')
        allentries = feedparser.parse(feed.url)
        for entry in allentries.entries:
            print entry.title.encode('ascii', 'ignore')
            title = entry.title
            # Skip entries already stored for this channel.
            cur.execute(
                "select count(*) as count from stories where url = %s and location = %s",
                [str(entry.link), feed.channel])
            count = cur.fetchone()['count']
            if ((str(entry.link).find('news.ycombinator') < 1)
                    and (count < 1)):  # no (self) links, please:
                link = urlnorm.normalize(str(entry.link))
                # Prefer the feed-supplied GUID; fall back to the link URL.
                if hasattr(entry, 'id') is True:
                    id = entry.id
                    print "Link has an ID"
                else:
                    id = str(entry.link)
                    print "Link has no ID"

                # Re-check dedup against both normalized URL and GUID.
                cur.execute(
                    "select count(*) as count from stories where (url = %s or id_from_feed = %s) and location = %s",
                    [link, str(id), feed.channel])
                count = cur.fetchone()['count']
                if count < 1:
                    #New Story, for this chan.
                    #But is it new for all of Lonava?
                    cur.execute(
Example #18
0
    # Run this feed only once its next scheduled time has passed.
    nextrun = lastrun + datetime.timedelta(0, feed.freq)
    print("Lastrun" + str(lastrun))
    print("Nextrun" + str(nextrun))
    print("Now is " + str(datetime.datetime.now()))
    if datetime.datetime.now() >= nextrun:
        # Fetch the Digg-style JSON story list.
        website = urllib2.urlopen(url)
        website_html = json.loads(website.read())
        website.close()

        a = 0

        while a < len(website_html[u'stories']):
            print str(a) + "/ " + str(len(website_html[u'stories']))
            digglink = website_html[u'stories'][a][u'href']
            title = website_html[u'stories'][a][u'title']
            link = urlnorm.normalize(str(website_html[u'stories'][a][u'link']))
            print str(link)

            # Skip links already stored for this channel.
            cur.execute(
                "select count(*) as count from stories where url = %s and location = %s",
                [link, feed.channel])
            count = cur.fetchone()['count']
            if count < 1:
                #New Story, for this chan.
                #But is it new for all of Lonava?
                cur.execute("select commentgroup from stories where url = %s",
                            [link])
                existing = cur.fetchall()
                if len(existing) > 0:
                    commentgroupid = existing[0]['commentgroup']
                else: