def test_time(self):
    # Date arithmetic with db.time() offsets: subtraction first, then addition.
    start = db.date("2010-09-21 9:27:00")
    offset = db.time(days=1, hours=1, minutes=1, seconds=1)
    earlier = start - offset
    self.assertEqual(str(earlier), "2010-09-20 08:25:59")
    # Adding a year and a month to 31 January; the expected string shows
    # the result landing on the last day of February.
    start = db.date(2014, 1, 31)
    offset = db.time(years=1, months=1)
    later = start + offset
    self.assertEqual(str(later), "2015-02-28 00:00:00")
    print("pattern.db.time()")
def setUp(self):
    # Build the two-row fixture table and store it on self.csv.
    # Columns: name, type, tail, age, date.
    fixture_rows = [
        [u"Schrödinger", "cat", True, 3, db.date(2009, 11, 3)],
        [u"Hofstadter", "labrador", True, 5, db.date(2007, 8, 4)],
    ]
    fixture_fields = [
        ["name", db.STRING],
        ["type", db.STRING],
        ["tail", db.BOOLEAN],
        ["age", db.INTEGER],
        ["date", db.DATE],
    ]
    self.csv = db.CSV(rows=fixture_rows, fields=fixture_fields)
def test_format(self):
    # Checks custom input/output formats of db.date() and dates before 1900.
    # Assert custom input formats.
    v = db.date("2010-09", "%Y-%m")
    self.assertEqual(str(v), "2010-09-01 00:00:00")
    self.assertEqual(v.year, 2010)
    # Assert custom output formats.
    v = db.date("2010-09", "%Y-%m", format="%Y-%m")
    self.assertEqual(v.format, "%Y-%m")
    self.assertEqual(str(v), "2010-09")
    self.assertEqual(v.year, 2010)
    # Assert strftime() for date < 1900.
    v = db.date(1707, 4, 15)
    self.assertEqual(str(v), "1707-04-15 00:00:00")
    # The lambda defers the attribute access so assertRaises can catch the error.
    self.assertRaises(ValueError, lambda: v.timestamp)
    # print() with a single argument behaves the same on Python 2 and 3.
    print("pattern.db.Date.__str__()")
class TestDatabase(unittest.TestCase):
    # Backend-independent database tests.
    # A subclass is expected to provide self.db and self.type in setUp().

    def setUp(self):
        # Define self.db and self.type in a subclass.
        pass

    def tearDown(self):
        # Drop every table created by a test.
        # Iterate over a snapshot, since dropping modifies the database being iterated.
        for table in list(self.db):
            self.db.drop(table)

    def test_escape(self):
        # Assert str, unicode, int, long, float, bool and None field values.
        # The long case is written int(1) (instead of the literal 1L),
        # so that the module also parses on Python 3.
        for v, s in (
                (   "a", "'a'"),
                (  u"a", "'a'"),
                (     1, "1"),
                (int(1), "1"),
                (   1.0, "1.0"),
                (  True, "1"),
                ( False, "0"),
                (  None, "null")):
            self.assertEqual(db._escape(v), s)
        # Assert date.
        v = db.date("1707-04-15")
        self.assertEqual(db._escape(v), "'1707-04-15 00:00:00'")
        # Assert current date.
        v = "current_timestamp"
        self.assertEqual(db._escape(v), "current_timestamp")
        # Assert subquery.
        v = self.db.create("dummy", fields=[db.pk()])
        v = v.query()
        self.assertEqual(db._escape(v), "(select dummy.* from `dummy`)")
        # Assert MySQL and SQLite quotes.
        if self.db.type == db.MYSQL:
            self.assertEqual(self.db.escape("'"), "'\\''")
        if self.db.type == db.SQLITE:
            self.assertEqual(self.db.escape("'"), "''''")
        # print() with a single argument behaves the same on Python 2 and 3.
        print("pattern.db._escape()")
def test_escape(self):
    # Scalar values paired with the SQL literal db._escape() must return for them.
    expectations = (
        ("a", "'a'"),
        (1, "1"),
        (int(1), "1"),
        (1.0, "1.0"),
        (True, "1"),
        (False, "0"),
        (None, "null"),
    )
    for value, literal in expectations:
        self.assertEqual(db._escape(value), literal)
    # A date is expected as a quoted timestamp string.
    day = db.date("1707-04-15")
    self.assertEqual(db._escape(day), "'1707-04-15 00:00:00'")
    # The current_timestamp keyword is expected back unquoted.
    keyword = "current_timestamp"
    self.assertEqual(db._escape(keyword), "current_timestamp")
    # A query on a freshly created table is expected as a parenthesised subquery.
    table = self.db.create("dummy", fields=[db.pk()])
    subquery = table.query()
    self.assertEqual(db._escape(subquery), "(select dummy.* from `dummy`)")
    # The escaping of a single quote depends on the database type.
    if self.db.type == db.MYSQL:
        self.assertEqual(self.db.escape("'"), "'\\''")
    if self.db.type == db.SQLITE:
        self.assertEqual(self.db.escape("'"), "''''")
    print("pattern.db._escape()")
def test_timestamp(self):
    # Date.timestamp check; always skipped for now (see the FIXME message).
    skip_reason = "FIXME see GH issue 94."
    if True:
        # The constant guard keeps the statements below in place
        # for when the skip is removed.
        raise unittest.SkipTest(skip_reason)
    stamp = db.date(2010, 9, 21, format=db.DEFAULT_DATE_FORMAT).timestamp
    self.assertEqual(stamp, 1285041600)
    print("pattern.db.Date.timestamp")
def test_timestamp(self):
    # Date.timestamp check; always skipped for now (see the FIXME message).
    skip_reason = "FIXME see GH issue 94."
    if True:
        # The constant guard keeps the statements below in place
        # for when the skip is removed.
        raise unittest.SkipTest(skip_reason)
    stamp = db.date(2010, 9, 21, format=db.DEFAULT_DATE_FORMAT).timestamp
    self.assertEqual(stamp, 1285020000)
    print("pattern.db.Date.timestamp")
def test_date(self):
    # Checks that db.date() accepts strings, a timestamp, NOW and integer components.
    # Assert string input and default date formats.
    for s in (
            "2010-09-21 09:27:01",
            "2010-09-21T09:27:01Z",
            "2010-09-21T09:27:01+0000",
            "2010-09-21 09:27",
            "2010-09-21",
            "21/09/2010",
            "21 September 2010",
            "September 21 2010",
            "September 21, 2010",
            1285054021):
        v = db.date(s)
        self.assertEqual(v.format, "%Y-%m-%d %H:%M:%S")
        self.assertEqual(v.year, 2010)
        self.assertEqual(v.month, 9)
        self.assertEqual(v.day, 21)
    # Assert NOW.
    for v in (db.date(), db.date(db.NOW)):
        # Take one snapshot of the clock so year, month and day are compared
        # against the same instant.
        now = datetime.datetime.now()
        self.assertEqual(v.year, now.year)
        self.assertEqual(v.month, now.month)
        self.assertEqual(v.day, now.day)
    self.assertEqual(db.date().year, db.YEAR)
    # Assert integer input.
    # second=1 replaces the literal 01, which is a syntax error on Python 3.
    v1 = db.date(2010, 9, 21, format=db.DEFAULT_DATE_FORMAT)
    v2 = db.date(2010, 9, 21, 9, 27, 1, 0, db.DEFAULT_DATE_FORMAT)
    v3 = db.date(2010, 9, 21, hour=9, minute=27, second=1, format=db.DEFAULT_DATE_FORMAT)
    self.assertEqual(str(v1), "2010-09-21 00:00:00")
    self.assertEqual(str(v2), "2010-09-21 09:27:01")
    self.assertEqual(str(v3), "2010-09-21 09:27:01")
    # Assert DateError for other input.
    self.assertRaises(db.DateError, db.date, None)
    # print() with a single argument behaves the same on Python 2 and 3.
    print("pattern.db.date()")
def test_csv(self):
    # Assert saving and loading data (field types are preserved).
    v = self.csv
    try:
        v.save("test.csv", headers=True)
        v = db.CSV.load("test.csv", headers=True)
        self.assertTrue(isinstance(v, list))
        self.assertTrue(v.headers[0] == (u"name", db.STRING))
        self.assertTrue(v[0] == [u"Schrödinger", "cat", True, 3, db.date(2009, 11, 3)])
    finally:
        # Remove the temporary file even when an assertion above fails,
        # so a failing run does not leave test.csv behind.
        if os.path.exists("test.csv"):
            os.unlink("test.csv")
    # print() with a single argument behaves the same on Python 2 and 3.
    print("pattern.db.CSV")
    print("pattern.db.CSV.save()")
    print("pattern.db.CSV.load()")
def test_group(self):
    # Assert WHERE with AND/OR combinations from Group object().
    yesterday = db.date()
    yesterday -= db.time(days=1)
    g1 = db.Group(("name", "garlic bread"))
    g2 = db.Group(("name", "pizza"), ("price", 10, "<"), operator=db.AND)
    g3 = db.Group(g1, g2, operator=db.OR)
    g4 = db.Group(g3, ("date", yesterday, ">"), operator=db.AND)
    self.assertEqual(g1.SQL(), "name='garlic bread'")
    self.assertEqual(g2.SQL(), "name='pizza' and price<10")
    # Nested groups are expected to be wrapped in parentheses.
    self.assertEqual(g3.SQL(), "(name='garlic bread') or (name='pizza' and price<10)")
    self.assertEqual(g4.SQL(), "((name='garlic bread') or (name='pizza' and price<10)) and date>'%s'" % yesterday)
    # Assert subquery in group.
    q = self._query(fields=["name"])
    g = db.any(("name", u"Gödel"), ("name", q))
    self.assertEqual(g.SQL(), u"name='Gödel' or name in (select persons.name from `persons`)")
    # print() with a single argument behaves the same on Python 2 and 3.
    print("pattern.db.Group")
def test_filterchain(self):
    # Assert WHERE with AND/OR combinations from FilterChain object().
    yesterday = db.date()
    yesterday -= db.time(days=1)
    f1 = db.FilterChain(("name", "garlic bread"))
    f2 = db.FilterChain(("name", "pizza"), ("price", 10, "<"), operator=db.AND)
    f3 = db.FilterChain(f1, f2, operator=db.OR)
    f4 = db.FilterChain(f3, ("date", yesterday, ">"), operator=db.AND)
    self.assertEqual(f1.SQL(), "name='garlic bread'")
    self.assertEqual(f2.SQL(), "name='pizza' and price<10")
    # Nested chains are expected to be wrapped in parentheses.
    self.assertEqual(f3.SQL(), "(name='garlic bread') or (name='pizza' and price<10)")
    self.assertEqual(f4.SQL(), "((name='garlic bread') or (name='pizza' and price<10)) and date>'%s'" % yesterday)
    # Assert subquery in filter chain.
    q = self._query(fields=["name"])
    f = db.any(("name", u"Gödel"), ("name", q))
    self.assertEqual(f.SQL(), u"name='Gödel' or name in (select persons.name from `persons`)")
    # print() with a single argument behaves the same on Python 2 and 3.
    print("pattern.db.FilterChain")
def test_filterchain(self):
    # WHERE clauses produced by FilterChain objects combined with AND/OR.
    yesterday = db.date()
    yesterday -= db.time(days=1)
    bread = db.FilterChain(("name", "garlic bread"))
    cheap_pizza = db.FilterChain(("name", "pizza"), ("price", 10, "<"), operator=db.AND)
    either = db.FilterChain(bread, cheap_pizza, operator=db.OR)
    recent = db.FilterChain(either, ("date", yesterday, ">"), operator=db.AND)
    # Each chain paired with the SQL it is expected to produce;
    # nested chains are expected in parentheses.
    expected = (
        (bread, "name='garlic bread'"),
        (cheap_pizza, "name='pizza' and price<10"),
        (either, "(name='garlic bread') or (name='pizza' and price<10)"),
        (recent, "((name='garlic bread') or (name='pizza' and price<10)) and date>'%s'" % yesterday),
    )
    for chain, sql in expected:
        self.assertEqual(chain.SQL(), sql)
    # A query used as a filter value is expected as an "in (subquery)" clause.
    subquery = self._query(fields=["name"])
    combined = db.any(("name", u"Gödel"), ("name", subquery))
    self.assertEqual(combined.SQL(), u"name='Gödel' or name in (select persons.name from `persons`)")
    print("pattern.db.FilterChain")
def age(name):
    """ Returns the age of the given person, or None if it cannot be determined.
        The person's English Wikipedia article is searched with three strategies, in order:
        1) the .bday and .dday elements (or a "(born 1 December 2000)" phrase
           and the current year when they are missing),
        2) a "(May 15, 1912 – October 7, 2003)" phrase in the article text,
        3) an "(aged 78)" phrase in an infobox cell.
        Each strategy is best-effort: any error moves on to the next one.
    """
    # Pre-bind the parsed page and its text, so that the later strategies
    # fail in a defined way (instead of with a NameError) when the download fails.
    t = s = None
    # Use regular expression to try and parse
    # a number of date formats from Wikipedia.
    try:
        w = Wikipedia(language="en")
        p = w.search(name, cached=True)
        t = DOM(p.src)
        s = plaintext(p.string)
        s = re.sub(r"\[[0-9]+\]", "", s) # Strip reference markers such as [12].
        r = r"\(born ([0-9]+ [A-Za-z]+ )?([0-9]{4})\)" # (born 1 December 2000)
        x = t(".bday")
        y = t(".dday")
        x = x[0].content if x else re.search(r, s).group(2)
        y = y[0].content if y else str(date().year) # No .dday element: use the current year.
        x = plaintext(x)
        y = plaintext(y)
        x = x.split("-")[0] # YYYY-MM-DD
        y = y.split("-")[0]
        return int(y) - int(x)
    except Exception:
        # Not bare except: KeyboardInterrupt and SystemExit must still propagate.
        pass
    try:
        # u"" with escaped backslashes replaces the ur"" prefix, which Python 3 rejects.
        r = u"[A-Za-z]+ [0-9]+, ([0-9]{4})"
        r = u"\\(%s – %s\\)" % (r, r) # (May 15, 1912 – October 7, 2003)
        m = re.search(r, s) # Search once, read both years from the same match.
        return int(m.group(2)) - int(m.group(1))
    except Exception:
        pass
    try:
        r = r"\(aged ([0-9]+)\)" # (aged 78)
        a = t(".infobox td:contains('aged')")
        a = a[0].content
        a = plaintext(a)
        a = re.search(r, a).group(1)
        return int(a)
    except Exception:
        pass
    return None
def insertWebpage(self, page, connection=False):
    """ Adds the given page to the websites table and records its relation to page.parent.
        If the page has a domain, the domain is inserted first and its id is stored with the page.
        If connection is set, the row is marked as connected and stamped with the current date.
        When sqlite3.IntegrityError is raised while adding the page or its relation,
        and connection is set, the existing row with the same address is updated instead.
    """
    domain_key = None
    if page.url.domain:
        self.insertDomain(page.url.domain)
        domain_key = self.getDomainId(page.url.domain)
    # Only pages that were connected to get a visit date.
    visited_on = date(NOW) if connection else None
    try:
        self.db.websites.append(
            address=page.url.string,
            domain_id=domain_key,
            connected=connection,
            lastVisited=visited_on)
        self.insertRelation(page.parent, page)
    except sqlite3.IntegrityError:
        if not connection:
            return
        matching_address = all(eq('address', page.url.string))
        self.db.websites.update(matching_address, connected=True, lastVisited=visited_on)
def test_time(self):
    # Assert Date - time(): one day, hour, minute and second is subtracted.
    v = db.date("2010-09-21 9:27:00")
    v = v - db.time(days=1, hours=1, minutes=1, seconds=1)
    self.assertEqual(str(v), "2010-09-20 08:25:59")
    # print() with a single argument behaves the same on Python 2 and 3.
    print("pattern.db.time()")
def test_timestamp(self):
    # Assert Date.timestamp.
    # NOTE(review): the expected value is a fixed epoch offset; if Date.timestamp
    # is computed from local time this only passes in one timezone -- confirm.
    v = db.date(2010, 9, 21, format=db.DEFAULT_DATE_FORMAT)
    self.assertEqual(v.timestamp, 1285020000)
    # print() with a single argument behaves the same on Python 2 and 3.
    print("pattern.db.Date.timestamp")
# Pattern is a web mining module for the Python programming language. # It has tools for data mining (Google, Twitter and Wikipedia API, a web crawler, a HTML DOM parser), natural # language processing (part-of-speech taggers, n-gram search, sentiment analysis, WordNet), machine learning # (vector space model, clustering, SVM), network analysis and <canvas> visualization. # Web mining. # A simple web mining technique. from pattern.web import Newsfeed, plaintext from pattern.db import date from pattern.vector import Model, Document, LEMMA news, url = {}, 'http://news.google.com/news?output=rss' for story in Newsfeed().search(url, cached=False): d = str(date(story.date, format='%Y-%m-%d')) s = plaintext(story.description) # Each key in the news dictionary is a date: news is grouped per day. # Each value is a dictionary of id => story items. # We use hash(story.description) as a unique id to avoid duplicate content. news.setdefault(d, {})[hash(s)] = s # Your code will probably have some preprocessing steps to save and load the mined news updates. m = Model() for date, stories in news.items(): s = stories.values() s = ' '.join(s).lower()
from builtins import str, bytes, dict, int
import os
import sys
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", ".."))
from pattern.db import date, time, NOW
from pattern.web import Bing, NEWS

# Keeping a date stamp for each row in a table is often useful,
# and the date() function in pattern.db can be used to create one.
# It wraps Python's datetime.datetime class and adds functionality
# to parse a date from a string, or to print it as a string.

# The examples, in order: the current date (twice), a date parsed with an explicit
# input format (twice), and a date with a custom output format.
# The formatting options are those of strftime:
# http://docs.python.org/library/time.html#time.strftime
for stamp in (
        date(NOW),
        date(),
        date("2010-11-01 16:30", "%Y-%m-%d %H:%M"),
        date("Nov 1, 2010", "%b %d, %Y"),
        date("Nov 1, 2010", "%b %d, %Y", format="%d/%m/%Y")):
    print(stamp)
print("")

# Result.date in the web module is a plain string (so it can't be used with > <= +=);
# date() can parse it.
engine = Bing(license=None, language="en")
for result in engine.search("today", type=NEWS):
    print(result.title)
    print(repr(result.date))
    print(date(result.date))
    print("")
def oringinalTwInsert(filename,db):
    """ Parses the original tweets in the given dump file and inserts them into `originaltable`.
        filename: path of the tweet dump (opened in binary mode, lines are converted with zh2unicode()).
        db: not used by this function (kept for compatibility with existing callers).
        Per-tweet metadata (user id, creation date, counters) is looked up by tweet id
        in orgin.txt. Rows are inserted in batches of 10000; a batch that raises
        IntegrityError is reported and skipped. Returns None.
    """
    origin_path = '/media/M_fM__VM_0M_eM__JM__M_eM__MM_7/tangjie/weibocontents/orgin.txt'
    sql = 'INSERT INTO originaltable values(%s,%s,%s,%s,%s,%s,%s,%s,%s)'
    batch = 10000
    # The with-statement closes both files, also when parsing or inserting fails.
    with open(filename,'rb') as f, open(origin_path,'r') as forigin:
        # NOTE(review): the database credentials are hard-coded here;
        # consider moving them to configuration.
        conn = MySQLdb.connect(host='localhost',user='******',passwd='20090924',charset='utf8')
        try:
            cursor = conn.cursor()
            try:
                DB_NAME = 'tangdb'
                conn.select_db(DB_NAME)
                originaldict = _read_origin_index(forigin)
                orlist = _parse_original_tweets(f, originaldict)
                for start in range(0, len(orlist), batch):
                    try:
                        cursor.executemany(sql, orlist[start:start+batch])
                        conn.commit()
                    except IntegrityError:
                        print("Duplicate entry for key originaltable_sid")
                    # Progress: the offset of the next batch.
                    print(start+batch)
            finally:
                cursor.close()
        finally:
            conn.close()


def _read_origin_index(forigin):
    """ Returns a dict of tweet id => list of the remaining whitespace-separated fields on its line.
    """
    # NOTE(review): lines are split on any whitespace; if the stored creation date
    # contains a space it occupies two fields and shifts the counters -- confirm
    # against the code that writes this file.
    index = {}
    for line in forigin:
        fields = line.strip().split()
        if not fields:
            # A blank line has no id. The message is the one the code has always
            # printed in this case, even though no duplicate is involved.
            print('Duplicate Key')
            continue
        index[fields[0]] = fields[1:]
    return index


def _parse_original_tweets(f, originaldict):
    """ Returns a list of (i, id, user id, created at, text, mention, link, orRt, rt) tuples,
        where i is a counter starting at 1.
        Parsing stops at the end of the file, at a blank line,
        at a tweet id missing from originaldict, or at the first malformed record.
    """
    c = ('@', 'link')
    orlist = []
    i = 1
    orstr = f.readline()
    while True:
        try:
            instr = zh2unicode(orstr.strip()) if orstr else None
            ortw = instr.split() if instr else []
            if not ortw:
                # End of file, blank line, or a line that could not be converted.
                break
            ortwid = ortw[0]
            if ortwid not in originaldict:
                # Stop parsing; the rows collected so far are still returned.
                print('key error: '+str(ortwid))
                break
            origlist = originaldict[ortwid]
            ortwuid = origlist[0]
            orcreAt = date(origlist[1])
            orRt = origlist[2]
            rt = origlist[3]
            ortweet = zh2unicode(f.readline().strip())
            orstr = zh2unicode(f.readline().strip())
            mention = ''
            link = ''
            # Optional '@' and 'link' lines follow the tweet text. The emptiness check
            # makes the loop end cleanly at the end of the file, so the last tweet is kept.
            tokens = orstr.split() if orstr else []
            while tokens and tokens[0] in c:
                if tokens[0] == '@':
                    mention = ' '.join(tokens[1:])
                if tokens[0] == 'link':
                    link = ' '.join(tokens[1:])
                orstr = zh2unicode(f.readline().strip())
                tokens = orstr.split() if orstr else []
            orlist.append((i,ortwid,ortwuid,orcreAt,ortweet,mention,link,orRt,rt))
            i = i+1
            print('tweet: '+ortwid+' has finished!')
        except Exception as e:
            # A malformed record ends parsing (as before), but is now reported.
            print('parse error, stopping: '+repr(e))
            break
    return orlist
def retweetInsert(filename,db):
    """ Parses the given retweet dump and inserts the retweets into `retwtable`.
        filename: path of the dump (opened in binary mode, lines are converted with zh2unicode()).
        db: not used by this function (kept for compatibility with existing callers).
        For each original tweet, a tab-separated line (id, user id, creation date, counters)
        is written to orgin.txt, and its retweets are inserted with one executemany() call.
        A retweet id that was seen before is written to dupretweet.txt instead of inserted.
        Parsing stops at the end of the file, at a blank line or at the first malformed record.
        Returns None.
    """
    origin_path = '/media/M_fM__VM_0M_eM__JM__M_eM__MM_7/tangjie/weibocontents/orgin.txt'
    dup_path = '/media/M_fM__VM_0M_eM__JM__M_eM__MM_7/tangjie/weibocontents/dupretweet.txt'
    sql = 'INSERT INTO retwtable values(%s,%s,%s,%s,%s,%s,%s,%s,%s)'
    c = ('@','retweet','link')

    def parse_time(token):
        # The last '-' of the token separates the date part from the time part;
        # it is replaced by a space before the string is handed to date().
        timelist = token.strip().split('-')
        return date('-'.join(timelist[:-1])+' '+timelist[-1])

    # The with-statement closes all three files, also when parsing or inserting fails.
    with open(filename,'rb') as f, open(origin_path,'w') as forigin, open(dup_path,'w') as fdup:
        # NOTE(review): the database credentials are hard-coded here;
        # consider moving them to configuration.
        conn = MySQLdb.connect(host='localhost',user='******',passwd='20090924',charset='utf8')
        try:
            cursor = conn.cursor()
            try:
                DB_NAME = 'tangdb'
                conn.select_db(DB_NAME)
                i = 1
                m = 0
                # A set gives a constant-time check for retweet ids that were seen before.
                rtAll = set()
                rtstr = f.readline()
                while True:
                    try:
                        instr = zh2unicode(rtstr.strip()) if rtstr else None
                        ortw = instr.split() if instr else []
                        if not ortw:
                            # End of file, blank line, or a line that could not be converted.
                            break
                        ortwid = ortw[0]
                        ortwuid = ortw[1]
                        orcreAt = parse_time(ortw[2])
                        orRt = ortw[-1]
                        # The next line holds the number of retweets that follow.
                        rt = f.readline().strip()
                        forigin.write('\t'.join([ortwid,ortwuid,str(orcreAt),orRt,rt])+'\n')
                        rwlist = []
                        rtstr = zh2unicode(f.readline().strip())
                        for j in range(int(rt)):
                            rtlist = rtstr.split()
                            rtuid = rtlist[0]
                            rtcreAt = parse_time(rtlist[1])
                            rtid = rtlist[-1]
                            rtTw = zh2unicode(f.readline().strip())
                            rtstr = zh2unicode(f.readline().strip())
                            mention = ''
                            rtfrom = ''
                            link = ''
                            # Optional '@', 'retweet' and 'link' lines follow the retweet text.
                            # The emptiness check makes the loop end cleanly at the end of the
                            # file, so the retweets of the last tweet are still inserted.
                            tokens = rtstr.split() if rtstr else []
                            while tokens and tokens[0] in c:
                                if tokens[0] == '@':
                                    mention = ' '.join(tokens[1:])
                                if tokens[0] == 'retweet':
                                    rtfrom = ' '.join(tokens[1:])
                                if tokens[0] == 'link':
                                    link = ' '.join(tokens[1:])
                                rtstr = zh2unicode(f.readline().strip())
                                tokens = rtstr.split() if rtstr else []
                            if rtid in rtAll:
                                rtrecord = (rtid,ortwid,rtuid,str(rtcreAt),rtTw,mention,rtfrom,link)
                                fdup.write('\t'.join(rtrecord)+'\n')
                            else:
                                # The counter is incremented before use, so the first row gets 2.
                                i = i+1
                                rtrecord = (i,rtid,ortwid,rtuid,str(rtcreAt),rtTw,mention,rtfrom,link)
                                rwlist.append(rtrecord)
                                rtAll.add(rtid)
                        try:
                            cursor.executemany(sql, rwlist)
                            conn.commit()
                        except IntegrityError:
                            print("Duplicate entry for key retwtable_sid")
                        m = m+1
                        print(str(m)+':retweet of '+str(ortwid)+' has finished: '+str(rt))
                    except Exception as e:
                        # A malformed record ends parsing (as before), but is now reported.
                        print('parse error, stopping: '+repr(e))
                        break
            finally:
                cursor.close()
        finally:
            conn.close()
    print('retweet over!')
import os
import sys
sys.path.insert(0, os.path.join("..", ".."))
from pattern.db import date, time, NOW
from pattern.web import Bing, NEWS

# It is often useful to keep a date stamp for each row in the table.
# The pattern.db module's date() function can be used for this.
# It is a simple wrapper around Python's datetime.datetime class,
# with extra functionality to make it easy to parse or print it as a string.

# print() with a single argument behaves the same on Python 2 and 3;
# print("") is used for an empty line, because print() shows "()" on Python 2.
print(date(NOW))
print(date())
print(date("2010-11-01 16:30", "%Y-%m-%d %H:%M"))
print(date("Nov 1, 2010", "%b %d, %Y"))
print(date("Nov 1, 2010", "%b %d, %Y", format="%d/%m/%Y"))
print("")

# All possible formatting options:
# http://docs.python.org/library/time.html#time.strftime

for r in Bing(license=None, language="en").search("today", type=NEWS):
    print(r.title)
    print(repr(r.date)) # Result.date is a string (e.g. we can't > <= += with the date).
    print(date(r.date)) # date() can parse any Result.date in the web module.
    print("")

# A time() offset can be added to a date.
d  = date("4 november 2011")
d += time(days=2, hours=5)
def user2dict(filename):
    """ Generator that yields one record (a list of 14 values) for each user in the given UTF-8 file.
        The first 14 lines of the file are printed, the first 15 lines are skipped,
        and every following group of 15 lines is read as one user.
        Each record is [userid, name, province, city, location, description, gender,
        followersCount, friendsCount, statusesCount, createdAt, verified, verifiedType,
        biFollowersCount].
        If a group cannot be parsed, "<filename> error!" is printed and exit(0) is called.
    """
    def count(line):
        # Counters that are missing or not numeric are stored as 0.
        try:
            return int(line.strip())
        except ValueError:
            return 0

    # Read everything up front, so the file is closed before the first record is yielded.
    with codecs.open(filename,'rb',encoding='UTF-8') as f:
        filelines = f.readlines()
    print(''.join(filelines[:14]))
    i = 15
    while i < len(filelines):
        sublines = filelines[i:i+15]
        try:
            userid = sublines[0].strip()
            biFollowersCount = count(sublines[1])
            city = sublines[2].strip()
            verified = sublines[3].strip()
            followersCount = count(sublines[4])
            location = sublines[5].strip()
            province = sublines[6].strip()
            # NOTE(review): unlike the other counters, a friends count that is
            # not numeric aborts the run -- confirm this is intended.
            friendsCount = int(sublines[7].strip())
            # Backslashes are doubled in the free-text fields.
            name = sublines[8].strip().replace("\\","\\\\")
            gender = sublines[9].strip()
            # The last '-' separates the date part from the time part;
            # it is replaced by a space before the string is handed to date().
            timelist = sublines[10].strip().split('-')
            timestr = '-'.join(timelist[:-1])+' '+timelist[-1]
            createdAt = date(timestr)
            verifiedType = sublines[11].strip()
            statusesCount = count(sublines[12])
            description = sublines[13].strip().replace("\\","\\\\")
        except Exception:
            print(filename+" error!")
            exit(0)
        i = i+15
        # The yield is outside the try block, so closing the generator early
        # (or an error in the consumer) is not reported as a parse error.
        yield [userid, name, province, city, location, description, gender,
               followersCount, friendsCount, statusesCount, createdAt,
               verified, verifiedType, biFollowersCount]
# Python Data Science and Analytics.
# Data Science is a field in computer science that is dedicated to analyzing patterns in raw data using
# techniques like Artificial Intelligence (AI), Machine Learning (ML), mathematical functions, and
# statistical algorithms.
# Pattern is a web mining module for the Python programming language.
# It has tools for data mining (Google, Twitter and Wikipedia API, a web crawler, a HTML DOM parser), natural
# language processing (part-of-speech taggers, n-gram search, sentiment analysis, WordNet), machine learning
# (vector space model, clustering, SVM), network analysis and <canvas> visualization.

# Twitter Opinion Mining, results per day.
# To do this, we need to "bin" the tweets of a politician per day (or per week, month, year) and calculate the
# average sentiment of that day:
from pattern.db import Datasheet, date, avg
from collections import defaultdict

# bins[politician][(year, month, day)] => list of sentiment scores for that day.
bins = defaultdict(lambda: defaultdict(list))
# The date column is unpacked as `posted`, so that the imported date() function
# is not shadowed by the loop variable and can still be called.
# NOTE(review): assumes each row has exactly four columns, the third being the date -- confirm against data.csv.
for politician, party, posted, score in Datasheet.load("data.csv"):
    d = date(posted)
    d = (d.year, d.month, d.day)
    bins[politician][d].append(float(score))

# Replace each list of scores with its average.
for politician in bins:
    # Iterate over the days of this politician (not over the characters of the name).
    for day in bins[politician]:
        bins[politician][day] = avg(bins[politician][day])