def db_insert(self, command, args): if 1: # Debugging inserts - specifically loooking for tweets not inserted *and* inserted caller = inspect.stack()[1] Print("DBWrapper", int(caller[2]), caller[3], caller[1], command, args) self.cursor.execute(command, args) #xyz if 1: self.cursor_dupe.execute(command, args) #xyz Print("DBWrapper Insert Completed")
def db_select(self, command, args=None): if 1: # Debugging inserts - specifically loooking for tweets not inserted *and* inserted caller = inspect.stack()[1] Print("DBWrapper", int(caller[2]), caller[3], caller[1], command, args) if args: self.cursor.execute(command, args) #xyz else: self.cursor.execute(command) #xyz Print("DBWrapper Select Completed")
def store_historical_status(historical_status): serialised_data = cjson.encode(historical_status) try: f = open("historical_status.json", "wb") f.write(serialised_data) f.close() except Exception, e: # Doesn't really matter what the problem is. It failed, and there's nothing this code can do about it. Print( "Failed to WRITE historical_status.json, something is badly broken" ) Print("Exception was: ", e)
def main(self): twitterurl = "http://api.twitter.com/1/users/search.json" if self.proxy: proxyhandler = urllib2.ProxyHandler({"http": self.proxy}) twitopener = urllib2.build_opener(proxyhandler) urllib2.install_opener(twitopener) headers = {'User-Agent': "BBC R&D Grabber"} postdata = None if self.keypair == False: # Perform OAuth authentication - as we don't have the secret key pair we need to request it # This will require some user input request_token_url = 'http://api.twitter.com/oauth/request_token' access_token_url = 'http://api.twitter.com/oauth/access_token' authorize_url = 'http://api.twitter.com/oauth/authorize' token = None consumer = oauth.Consumer(key=self.consumerkeypair[0], secret=self.consumerkeypair[1]) params = { 'oauth_version': "1.0", 'oauth_nonce': oauth.generate_nonce(), 'oauth_timestamp': int(time.time()), } params['oauth_consumer_key'] = consumer.key req = oauth.Request(method="GET", url=request_token_url, parameters=params) signature_method = oauth.SignatureMethod_HMAC_SHA1() req.sign_request(signature_method, consumer, token) requestheaders = req.to_header() requestheaders['User-Agent'] = "BBC R&D Grabber" # Connect to Twitter try: req = urllib2.Request( request_token_url, None, requestheaders ) # Why won't this work?!? Is it trying to POST? conn1 = urllib2.urlopen(req) except httplib.BadStatusLine, e: Print("PeopleSearch BadStatusLine error:", e) conn1 = False except urllib2.HTTPError, e: Print("PeopleSearch HTTP error:", e.code) # sys.stderr.write('PeopleSearch HTTP error: ' + str(e.code) + '\n') conn1 = False
def get_historical_status(): try: f = open("historical_status.json") raw_data = f.read() f.close() historical_status = cjson.decode(raw_data) except Exception, e: # Doesn't really matter what the problem is. It failed, warn, and use a default. Print("Failed to read historical_status.json, creating a new one") Print("Exception was: ", e) historical_status = []
def main(self): c = 0 while True: if os.path.exists(self.stopfile): Print("Stop File Exists - Exitting") self.die() time.sleep(1)
def __init__(self, *argv, **argd): # This is to ensure that we play nicely inside a general hierarchy # Even though we inherit frm object. Otherwise we risk breaking the MRO of the class # We're used with. # These should all succeed now... Print("db.user, pass, maindb, nextdb", self.dbuser, self.dbpass, self.maindb, self.nextdb) super(DBWrapper, self).__init__(*argv, **argd) # Now configured centrally, but can be still overridden in the usual kamaelia way :-) self.cursor = None # xyz # dupe self.cursor_dupe = None # xyz #dupe
def killBookmarksProcess(): processline = os.popen( 'ps 2>/dev/null -aux|grep Bookmarks.py|grep -v grep').read() if processline: tokens = processline.split() user, pid, pcpu, pmem, vsz, rss, tty, stat, start, time, command = tokens[: 11] args = tokens[11:] os.system("kill -9 %s" % pid) else: Print( "Bookmarks.py is not running. This means the shell process wrapping it isn't starting it, or is dead. This program cannot fix that problem." )
def main(self): # sys.stdout.write("Handle Connect Request Start\n") self.control_message = None self.had_response = False buff = Linebuffer() lines = [] fail = False try: while True: for data in self.Inbox("inbox"): buff.feed(data) while buff.chompable(): line = buff.chomp() lines.append(line) if line == "\r\n": # We've now got the complete header. # We're now expecting a body, but SHOULD handle it. # For now, let's just handle the response line since it's all we really care about. rest = lines[1:] rline = lines[0] p = rline.find(" ") if p == -1: raise GeneralFail("HTTP Response Line Parse Failure: "+ repr(http_response_line)) http_version = rline[:p] rline = rline[p+1:] p = rline.find(" ") if p == -1: raise GeneralFail("HTTP Response Line Parse Failure: "+ repr(rline)) http_status = rline[:p] human_status = rline[p+1:] if 0: Print ("http_version,http_status,human_status",http_version,http_status,human_status) if http_status != "200": raise GeneralFail("HTTP Connect Failure : "+ repr(rline)) self.had_response = True self.checkControl() if not self.anyReady(): self.pause() yield 1 except ShutdownNow: pass except GeneralFail, e: # Yes, we're masking an error. This is nasty. fail = True
def main(self): try: while True: for data in self.Inbox("inbox"): self.send(self.tag + " : " + str(data), "outbox") for data in self.Inbox("togglebox"): Print( "toggling" ) self.tag = self.tag[-1::-1] # Reverse it. self.checkControl() if not self.anyReady(): self.pause() yield 1 except ShutdownNow: pass Print( "exitting tagger" ) if self.control_message: self.send(self.control_message, "signal") else: self.send(Axon.Ipc.producerFinished(), "signal")
def main(self): while not self.finished(): if self.dataReady("inbox"): # TODO This component is unfinished as it was never found to be needed channel = self.recv("inbox") time.sleep( 1) # Temporary delay to ensure not hammering /programmes nowplayingurl = "http://www.bbc.co.uk" + self.channels[ channel] + ".json" npdata = None # Grab BBC data self.send([nowplayingurl], "dataout") while not self.dataReady("datain"): yield 1 self.pause() recvdata = self.recv("datain") if recvdata[0] == "OK": content = recvdata[1] else: content = None # Read and decode now playing info if content != None: try: decodedcontent = cjson.decode(content) except cjson.DecodeError: e = sys.exc_info()[1] Print("cjson.DecodeError:", e.message) # Analyse now playing info if decodedcontent: # Not finished! - now playing json file is empty if nothing is playing! npdata = False self.send(npdata, "outbox") self.pause() yield 1
def MySQL_Running(): processline = os.popen( 'ps 2>/dev/null -aux|grep mysqld|grep -v grep').read() if processline: return True else: return False try: homedir = os.path.expanduser( "~") # Bootstrap from /root/, but use this to find the rest config_file = open(homedir + "/twitter-login.conf") except IOError, e: Print("Failed to load login data - exiting") sys.exit(0) raw_config = config_file.read() config_file.close() config = cjson.decode(raw_config) username = config['dbuser'] password = config['dbpass'] unixuser = config['unixuser'] homedir = os.path.expanduser("~" + unixuser) os.system("touch " + homedir + "/stop_bookmarks") time.sleep(1)
"WARNING - no tweet words analysed in 1 period, might be dead. Waiting 1 period" ) if deltas["analyseddata"][-1] == 0: Print( "WARNING - no tweets analysed in 1 period, might be dead. Waiting 1 period" ) return False try: homedir = os.path.expanduser( "~") # Bootstrap from /root/, but use this to find the rest config_file = open(homedir + "/twitter-login.conf") except IOError, e: Print("Failed to load login data - exiting") sys.exit(0) raw_config = config_file.read() config_file.close() config = cjson.decode(raw_config) username = config['dbuser'] password = config['dbpass'] unixuser = config['unixuser'] homedir = os.path.expanduser("~" + unixuser) state = get_database_state() historical_status = get_historical_status() add_current_state(historical_status, state) store_historical_status(historical_status)
def main(self): twitterurl = "http://api.twitter.com/1/users/search.json" if self.proxy: proxyhandler = urllib2.ProxyHandler({"http": self.proxy}) twitopener = urllib2.build_opener(proxyhandler) urllib2.install_opener(twitopener) headers = {'User-Agent': "BBC R&D Grabber"} postdata = None if self.keypair == False: # Perform OAuth authentication - as we don't have the secret key pair we need to request it # This will require some user input request_token_url = 'http://api.twitter.com/oauth/request_token' access_token_url = 'http://api.twitter.com/oauth/access_token' authorize_url = 'http://api.twitter.com/oauth/authorize' token = None consumer = oauth.Consumer(key=self.consumerkeypair[0], secret=self.consumerkeypair[1]) params = { 'oauth_version': "1.0", 'oauth_nonce': oauth.generate_nonce(), 'oauth_timestamp': int(time.time()), } params['oauth_consumer_key'] = consumer.key req = oauth.Request(method="GET", url=request_token_url, parameters=params) signature_method = oauth.SignatureMethod_HMAC_SHA1() req.sign_request(signature_method, consumer, token) requestheaders = req.to_header() requestheaders['User-Agent'] = "BBC R&D Grabber" # Connect to Twitter try: req = urllib2.Request( request_token_url, None, requestheaders ) # Why won't this work?!? Is it trying to POST? conn1 = urllib2.urlopen(req) except httplib.BadStatusLine: e = sys.exc_info()[1] Print("PeopleSearch BadStatusLine error:", e) conn1 = False except urllib2.HTTPError: e = sys.exc_info()[1] Print("PeopleSearch HTTP error:", e.code) # sys.stderr.write('PeopleSearch HTTP error: ' + str(e.code) + '\n') conn1 = False except urllib2.URLError: e = sys.exc_info()[1] Print("PeopleSearch URL error: ", e.reason) # sys.stderr.write('PeopleSearch URL error: ' + str(e.reason) + '\n') conn1 = False if conn1: content = conn1.read() conn1.close() request_token = dict(urlparse.parse_qsl(content)) Print("Request Token:") Print(" - oauth_token = ", request_token['oauth_token']) Print(" - oauth_token_secret = ", request_token['oauth_token_secret']) Print("") # The user must confirm authorisation so a URL is Printed here Print("Go to the following link in your browser:") Print("%s?oauth_token=%s" % (authorize_url, request_token['oauth_token'])) Print("") accepted = 'n' # Wait until the user has confirmed authorisation while accepted.lower() == 'n': accepted = raw_input('Have you authorized me? (y/n) ') oauth_verifier = raw_input('What is the PIN? ') token = oauth.Token(request_token['oauth_token'], request_token['oauth_token_secret']) token.set_verifier(oauth_verifier) params = { 'oauth_version': "1.0", 'oauth_nonce': oauth.generate_nonce(), 'oauth_timestamp': int(time.time()), } params['oauth_token'] = token.key params['oauth_consumer_key'] = consumer.key req = oauth.Request(method="GET", url=access_token_url, parameters=params) signature_method = oauth.SignatureMethod_HMAC_SHA1() req.sign_request(signature_method, consumer, token) requestheaders = req.to_header() requestheaders['User-Agent'] = "BBC R&D Grabber" # Connect to Twitter try: req = urllib2.Request( access_token_url, "oauth_verifier=%s" % oauth_verifier, requestheaders ) # Why won't this work?!? Is it trying to POST? 
conn1 = urllib2.urlopen(req) except httplib.BadStatusLine: e = sys.exc_info()[1] # sys.stderr.write('PeopleSearch BadStatusLine error: ' + str(e) + '\n') Print('PeopleSearch BadStatusLine error: ', e) conn1 = False except urllib2.HTTPError: e = sys.exc_info()[1] Print('PeopleSearch HTTP error: ', e.code) conn1 = False except urllib2.URLError: e = sys.exc_info()[1] # sys.stderr.write('PeopleSearch URL error: ' + str(e.reason) + '\n') Print('PeopleSearch URL error: ', e.reason) conn1 = False if conn1: content = conn1.read() conn1.close() access_token = dict(urlparse.parse_qsl(content)) # Access tokens retrieved from Twitter Print("Access Token:") Print(" - oauth_token = ", access_token['oauth_token']) Print(" - oauth_token_secret = ", access_token['oauth_token_secret']) Print("") Print( "You may now access protected resources using the access tokens above." ) Print("") save = False # Load config to save OAuth keys try: homedir = os.path.expanduser("~") file = open(homedir + "/twitter-login.conf", 'r') save = True except IOError: e = sys.exc_info()[1] Print( "Failed to load config file - not saving oauth keys: ", e) if save: raw_config = file.read() file.close() # Read config and add new values config = cjson.decode(raw_config) config['key'] = access_token['oauth_token'] config['secret'] = access_token['oauth_token_secret'] raw_config = cjson.encode(config) # Write out the new config file try: file = open(homedir + "/twitter-login.conf", 'w') file.write(raw_config) file.close() except IOError: e = sys.exc_info()[1] Print("Failed to save oauth keys: ", e) self.keypair = [ access_token['oauth_token'], access_token['oauth_token_secret'] ] while not self.finished(): # TODO: Implement backoff algorithm in case of connection failures - watch out for the fact this could delay the requester component if self.dataReady("inbox"): # Retieve keywords to look up person = self.recv("inbox") # Ensure we're not rate limited during the first request - if so we'll wait for 15 mins before our next request if (datetime.today() - timedelta(minutes=15)) > self.ratelimited: requesturl = twitterurl + "?q=" + urllib.quote( person) + "&per_page=5" params = { 'oauth_version': "1.0", 'oauth_nonce': oauth.generate_nonce(), 'oauth_timestamp': int(time.time()), } token = oauth.Token(key=self.keypair[0], secret=self.keypair[1]) consumer = oauth.Consumer(key=self.consumerkeypair[0], secret=self.consumerkeypair[1]) params['oauth_token'] = token.key params['oauth_consumer_key'] = consumer.key req = oauth.Request(method="GET", url=requesturl, parameters=params) signature_method = oauth.SignatureMethod_HMAC_SHA1() req.sign_request(signature_method, consumer, token) requestheaders = req.to_header() requestheaders['User-Agent'] = "BBC R&D Grabber" # Connect to Twitter try: req = urllib2.Request( requesturl, None, requestheaders ) # Why won't this work?!? Is it trying to POST? 
conn1 = urllib2.urlopen(req) except httplib.BadStatusLine: e = sys.exc_info()[1] # sys.stderr.write('PeopleSearch BadStatusLine error: ' + str(e) + '\n') Print('PeopleSearch BadStatusLine error: ', e) conn1 = False except urllib2.HTTPError: e = sys.exc_info()[1] # sys.stderr.write('PeopleSearch HTTP error: ' + str(e.code) + '\n') Print('PeopleSearch HTTP error: ', e.code) conn1 = False except urllib2.URLError: e = sys.exc_info()[1] # sys.stderr.write('PeopleSearch URL error: ' + str(e.reason) + '\n') Print('PeopleSearch URL error: ', e.reason) conn1 = False if conn1: # Check rate limiting here and Print current limit headers = conn1.info() try: headerlist = string.split(str(headers), "\n") except UnicodeEncodeError: # str may fail... headerlist = [] for line in headerlist: if line != "": splitheader = line.split() if splitheader[ 0] == "X-FeatureRateLimit-Remaining:" or splitheader[ 0] == "X-RateLimit-Remaining:": Print(splitheader[0], " ", splitheader[1]) if int(splitheader[1]) < 5: self.ratelimited = datetime.today() # Grab json format result of people search here try: data = conn1.read() try: content = cjson.decode(data) self.send(content, "outbox") except cjson.DecodeError: self.send(dict(), "outbox") except IOError: e = sys.exc_info()[1] # sys.stderr.write('PeopleSearch IO error: ' + str(e) + '\n') Print('PeopleSearch IO error: ', e) self.send(dict(), "outbox") conn1.close() else: self.send(dict(), "outbox") else: Print("Twitter search paused - rate limited") self.send(dict(), "outbox") self.pause() yield 1
def main(self): while not self.finished(): if self.dataReady("inbox"): channel = self.recv("inbox") time.sleep( 1) # Temporary delay to ensure not hammering /programmes # Setup in case of URL errors later data = None # Define URLs for getting schedule data and DVB bridge information # By BBC convention, schedule info runs to 5am the next day if datetime.utcnow().hour < 5: scheduleurl = "http://www.bbc.co.uk" + self.channels[ channel][1] + "/" + time.strftime( "%Y/%m/%d", time.gmtime(time.time() - 86400)) + ".json" else: scheduleurl = "http://www.bbc.co.uk" + self.channels[ channel][1] + "/" + time.strftime( "%Y/%m/%d", time.gmtime(time.time())) + ".json" #syncschedurl = "http://beta.kamaelia.org:8082/dvb-bridge?command=channel&args=" + urllib.quote(self.channels[channel][0]) #synctimeurl = "http://beta.kamaelia.org:8082/dvb-bridge?command=time" syncschedurl = "http://10.92.164.147:8082/dvb-bridge?command=channel&args=" + urllib.quote( self.channels[channel][0]) synctimeurl = "http://10.92.164.147:8082/dvb-bridge?command=time" content = None # # Grab SyncTV time data to work out the offset between local (NTP) and BBC time (roughly) # self.send([synctimeurl], "dataout") # while not self.dataReady("datain"): # self.pause() # yield 1 # recvdata = self.recv("datain") # if recvdata[0] == "OK": # content = recvdata[1] # else: # content = None # Work out time difference between local time and BBC time if content != None: try: decodedcontent = cjson.decode(content) if decodedcontent[0] == "OK": difference = time.time( ) - decodedcontent[2]['time'] except cjson.DecodeError, e: Print("cjson.DecodeError:", e.message) if 'difference' in locals(): # FIXME *SOB* # Grab actual programme start time from DVB bridge channel page self.send([syncschedurl], "dataout") while not self.dataReady("datain"): self.pause() # Add timeout ? # yield 1 recvdata = self.recv("datain") if recvdata[0] == "OK": content = recvdata[1] else: content = None if content != None: try: decodedcontent = cjson.decode(content) if decodedcontent[0] == "OK": proginfo = decodedcontent[2]['info'] except cjson.DecodeError, e: Print("cjson.DecodeError:", e.message) # Grab BBC schedule data for given channel self.send([scheduleurl], "dataout") while not self.dataReady("datain"): self.pause() # FIXME Add timeout? 
# yield 1 recvdata = self.recv("datain") if recvdata[0] == "OK": content = recvdata[1] else: content = None # Read and decode schedule if content != None: try: decodedcontent = cjson.decode(content) except cjson.DecodeError, e: Print("cjson.DecodeError:", e.message) if 'proginfo' in locals(): showdate = proginfo['NOW']['startdate'] showtime = proginfo['NOW']['starttime'] actualstart = proginfo['changed'] showdatetime = datetime.strptime( str(showdate[0]) + "-" + str(showdate[1]) + "-" + str(showdate[2]) + " " + str(showtime[0]) + ":" + str(showtime[1]) + ":" + str(showtime[2]), "%Y-%m-%d %H:%M:%S") # SyncTV (DVB Bridge) produced data - let's trust that if 'decodedcontent' in locals(): for programme in decodedcontent['schedule']['day'][ 'broadcasts']: starttime = parse(programme['start']) gmt = pytz.timezone("GMT") starttime = starttime.astimezone(gmt) starttime = starttime.replace(tzinfo=None) # Attempt to identify which DVB bridge programme corresponds to the /programmes schedule to get PID if showdatetime == starttime or ( showdatetime + timedelta(minutes=1) == starttime and string.lower( proginfo['NOW']['name']) == string.lower( programme['programme'] ['display_titles']['title']) ) or (showdatetime - timedelta(minutes=1) == starttime and string.lower(proginfo['NOW']['name']) == string.lower( programme['programme'] ['display_titles']['title'])): duration = ( proginfo['NOW']['duration'][0] * 60 * 60 ) + (proginfo['NOW']['duration'][1] * 60) + proginfo['NOW']['duration'][2] progdate = parse(programme['start']) tz = progdate.tzinfo utcoffset = datetime.strptime( str(tz.utcoffset(progdate)), "%H:%M:%S") utcoffset = utcoffset.hour * 60 * 60 # Something's not right with the code below #TODO #FIXME timestamp = time.mktime( showdatetime.timetuple()) + utcoffset if 'difference' in locals(): offset = (timestamp - actualstart) - difference else: offset = timestamp - actualstart pid = programme['programme']['pid'] title = programme['programme'][ 'display_titles']['title'] # Fix for unicode errors caused by some /programmes titles if (not isinstance(title, str)) and ( not isinstance(title, unicode)): title = str(title) Print(pid, title, offset, duration, showdatetime, "GMT", utcoffset) data = [ pid, title, offset, duration, timestamp, utcoffset ] else: # Couldn't use the DVB Bridge, so work out what's on NOW here utcdatetime = datetime.now() # Analyse schedule if 'decodedcontent' in locals(): for programme in decodedcontent['schedule']['day'][ 'broadcasts']: starttime = parse(programme['start']) starttime = starttime.replace(tzinfo=None) endtime = parse(programme['end']) endtime = endtime.replace(tzinfo=None) if (utcdatetime >= starttime) & (utcdatetime < endtime): pid = programme['programme']['pid'] title = programme['programme'][ 'display_titles']['title'] # Fix for unicode errors caused by some /programmes titles if (not isinstance(title, str)) and ( not isinstance(title, unicode)): title = str(title) # Has to assume no offset between scheduled and actual programme start time as it knows no better because of the lack of DVB bridge progdate = parse(programme['start']) tz = progdate.tzinfo utcoffset = datetime.strptime( str(tz.utcoffset(progdate)), "%H:%M:%S") utcoffset = utcoffset.hour * 60 * 60 timestamp = time.mktime( progdate.timetuple()) - utcoffset Print(pid, title, 0, programme['duration'], programme['start'], utcoffset) data = [ pid, title, 0, programme['duration'], timestamp, utcoffset ]
class PeopleSearch(component): Inboxes = { "inbox": "Receives string indicating a person's name", "control": "" } Outboxes = { "outbox": "Outputs raw search output from Twitter people search in JSON", "signal": "" } def __init__(self, consumerkeypair, keypair, proxy=False): super(PeopleSearch, self).__init__() self.proxy = proxy self.consumerkeypair = consumerkeypair self.keypair = keypair self.ratelimited = datetime.today() - timedelta(minutes=15) def finished(self): while self.dataReady("control"): msg = self.recv("control") if isinstance(msg, producerFinished) or isinstance( msg, shutdownMicroprocess): self.send(msg, "signal") return True return False def main(self): twitterurl = "http://api.twitter.com/1/users/search.json" if self.proxy: proxyhandler = urllib2.ProxyHandler({"http": self.proxy}) twitopener = urllib2.build_opener(proxyhandler) urllib2.install_opener(twitopener) headers = {'User-Agent': "BBC R&D Grabber"} postdata = None if self.keypair == False: # Perform OAuth authentication - as we don't have the secret key pair we need to request it # This will require some user input request_token_url = 'http://api.twitter.com/oauth/request_token' access_token_url = 'http://api.twitter.com/oauth/access_token' authorize_url = 'http://api.twitter.com/oauth/authorize' token = None consumer = oauth.Consumer(key=self.consumerkeypair[0], secret=self.consumerkeypair[1]) params = { 'oauth_version': "1.0", 'oauth_nonce': oauth.generate_nonce(), 'oauth_timestamp': int(time.time()), } params['oauth_consumer_key'] = consumer.key req = oauth.Request(method="GET", url=request_token_url, parameters=params) signature_method = oauth.SignatureMethod_HMAC_SHA1() req.sign_request(signature_method, consumer, token) requestheaders = req.to_header() requestheaders['User-Agent'] = "BBC R&D Grabber" # Connect to Twitter try: req = urllib2.Request( request_token_url, None, requestheaders ) # Why won't this work?!? Is it trying to POST? conn1 = urllib2.urlopen(req) except httplib.BadStatusLine, e: Print("PeopleSearch BadStatusLine error:", e) conn1 = False except urllib2.HTTPError, e: Print("PeopleSearch HTTP error:", e.code) # sys.stderr.write('PeopleSearch HTTP error: ' + str(e.code) + '\n') conn1 = False except urllib2.URLError, e: Print("PeopleSearch URL error: ", e.reason) # sys.stderr.write('PeopleSearch URL error: ' + str(e.reason) + '\n') conn1 = False
def main(self): self.dbConnect() while not self.finished(): twitdata = list() # Collect all current received tweet JSON and their related PIDs into a twitdata list while self.dataReady("inbox"): pids = list() data = self.recv("inbox") for pid in data[1]: pids.append(pid) twitdata.append([data[0], pids]) if len(twitdata) > 0: # Process the received twitdata for tweet in twitdata: tweet[0] = tweet[0].replace( "\\/", "/" ) # Fix slashes in links: This may need moving further down the line - ideally it would be handled by cjson if tweet[0] != "\r\n": # If \r\n is received, this is just a keep alive signal from Twitter every 30 secs # At this point, each 'tweet' contains tweetdata, and a list of possible pids newdata = cjson.decode(tweet[0]) if newdata.has_key('delete') or newdata.has_key( 'scrub_geo') or newdata.has_key('limit'): # Keep a record of all requests from Twitter for deletions, location removal etc # As yet none of these have been received, but this code will store them if they are received to enable debugging filepath = "contentDebug.txt" if os.path.exists(filepath): file = open(filepath, 'r') filecontents = file.read() else: filecontents = "" file = open(filepath, 'w') file.write(filecontents + "\n" + str(datetime.utcnow()) + " " + cjson.encode(newdata)) file.close() else: # This is a real tweet tweetid = newdata['id'] try: Print("New tweet! @", repr(newdata['user']['screen_name']), ": " + repr(newdata['text'])) except UnicodeEncodeError, e: Print("Unicode error suppressed", e) for pid in tweet[1]: # Cycle through possible pids, grabbing that pid's keywords from the DB # Then, check this tweet against the keywords and save to DB where appropriate (there may be more than one location) self.db_select( """SELECT keyword,type FROM keywords WHERE pid = %s""", (pid)) data = self.db_fetchall() for row in data: # Some keywords are stored with a ^. 
These must be split, and the tweet checked to see if it has both keywords, but not necessarily next to each other keywords = row[0].split("^") if len(keywords) == 2: if string.lower( keywords[0]) in string.lower( newdata['text'] ) and string.lower( keywords[1] ) in string.lower( newdata['text']): self.db_select( """SELECT timestamp,timediff FROM programmes WHERE pid = %s ORDER BY timestamp DESC""", (pid)) progdata = self.db_fetchone() if progdata != None: # Ensure the user hasn't already tweeted the same text # Also ensure they haven't tweeted in the past 10 seconds timestamp = time.mktime( parse(newdata['created_at'] ).timetuple()) self.db_select( """SELECT * FROM rawdata WHERE (pid = %s AND text = %s AND user = %s) OR (pid = %s AND user = %s AND timestamp >= %s AND timestamp < %s)""", (pid, newdata['text'], newdata['user'] ['screen_name'], pid, newdata['user'] ['screen_name'], timestamp - 10, timestamp + 10)) if self.db_fetchone() == None: Print( "Storing tweet for pid ", pid) # Work out where this tweet really occurred in the programme using timestamps and DVB bridge data progposition = timestamp - ( progdata[0] - progdata[1]) self.db_insert( """INSERT INTO rawdata (tweet_id,pid,timestamp,text,user,programme_position) VALUES (%s,%s,%s,%s,%s,%s)""", (tweetid, pid, timestamp, newdata['text'], newdata['user'] ['screen_name'], progposition)) break # Break out of this loop and back to check the same tweet against the next programme else: Print( "Duplicate tweet from user - ignoring" ) if string.lower(row[0]) in string.lower( newdata['text']): self.db_select( """SELECT timestamp,timediff FROM programmes WHERE pid = %s ORDER BY timestamp DESC""", (pid)) progdata = self.db_fetchone() if progdata != None: # Ensure the user hasn't already tweeted the same text for this programme # Also ensure they haven't tweeted in the past 10 seconds timestamp = time.mktime( parse(newdata['created_at']). timetuple()) self.db_select( """SELECT * FROM rawdata WHERE (pid = %s AND text = %s AND user = %s) OR (pid = %s AND user = %s AND timestamp >= %s AND timestamp < %s)""", (pid, newdata['text'], newdata['user'] ['screen_name'], pid, newdata['user'] ['screen_name'], timestamp - 10, timestamp + 10)) if self.db_fetchone() == None: Print("Storing tweet for pid ", pid) # Work out where this tweet really occurred in the programme using timestamps and DVB bridge data progposition = timestamp - ( progdata[0] - progdata[1]) self.db_insert( """INSERT INTO rawdata (tweet_id,pid,timestamp,text,user,programme_position) VALUES (%s,%s,%s,%s,%s,%s)""", (tweetid, pid, timestamp, newdata['text'], newdata['user'] ['screen_name'], progposition)) break # Break out of this loop and back to check the same tweet against the next programme else: Print( "Duplicate tweet from user - ignoring" ) else: Print("Blank line received from Twitter - no new data") Print("Done!") # new line to break up display else: time.sleep(0.1)
# It will also create files called namecache.conf, linkcache.conf and oversizedtweets.conf in your home directory # See the README for more information # Before we do anything. # First check to see if we're suppose to be even running. If we're not, don't start! import os import sys from Kamaelia.Apps.SocialBookmarks.Print import Print # Before we do anything. # First check to see if we're suppose to be even running. If we're not, don't start! if os.path.exists(os.path.join(os.path.expanduser("~"), "stop_bookmarks")): Print("Exitting bookmarks because ~/stop_bookmarks exists") start = False sys.exit(0) else: start = True # import Axon # Axon.Box.ShowAllTransits = True if start and (__name__ == "__main__"): import cjson from Kamaelia.Apps.SocialBookmarks.BBCProgrammes import WhatsOn from Kamaelia.Apps.SocialBookmarks.DataCollector import DataCollector, RawDataCollector
def main(self): # Print( "With component starting...") self.addChildren(self.item) self.item.activate() try: dontcontinue = False for graphstep in self.sequence: # Print( "Next/this graphstep :", graphstep) stopping = 0 if dontcontinue: break links = self.link_graphstep(graphstep) while True: # Let sub graphstep run, and wait for completion. Sleep as much as possible. if not self.anyReady(): self.pause() yield 1 self.checkControl() # Told by the outside world to shutdown dontcontinue = self.handleGraphstepShutdown() # Components inside have shutdown.. if self.anyStopped(): # print "Something stopped" all_stopped = True # Assume if self.item._isStopped(): Print( "Warning: Child died before completion", self.item ) self.shutdownChildComponents(shutdownMicroprocess()) dontcontinue = True for child in self.childComponents(): # Check assumption if child == self.item: continue # print "child stopped ?", child._isStopped(), child all_stopped = all_stopped and child._isStopped() if all_stopped: break else: stopping += 1 if (stopping % 1000) == 0: pass # print "Warning one child exited, but others haven't after", stopping, "loops" yield 1 if dontcontinue: break for link in links: self.unlink(thelinkage=link) # print "Exiting With Component... , all_stopped, dontcontinue:", all_stopped, dontcontinue self.link( (self, "_signal"), (self.item, "control") ) self.send( producerFinished(), "_signal") except ShutdownNow: # Print( "Shutting Down Now") self.shutdownChildComponents(self.control_message) # Print( "Sending shutdown to The Item") self.link( (self, "_signal"), (self.item, "control") ) self.send( self.control_message, "_signal")
def main(self): # Calculate running total and mean etc self.dbConnect() while not self.finished(): # The below does LIVE and FINAL analysis - do NOT run DataAnalyser at the same time Print("Analysis component: Checking for new data...") # Stage 1: Live analysis - could do with a better way to do the first query (indexed field 'analsed' to speed up for now) # Could move this into the main app to take a copy of tweets on arrival, but would rather solve separately if poss self.db_select( """SELECT tid,pid,timestamp,text,tweet_id,programme_position FROM rawdata WHERE analysed = 0 ORDER BY tid LIMIT 5000""" ) data = self.db_fetchall() # Cycle through all the as yet unanalysed tweets for result in data: tid = result[0] pid = result[1] tweettime = result[ 2] # Timestamp based on the tweet's created_at field tweettext = result[3] tweetid = result[ 4] # This is the real tweet ID, tid just makes a unique identifier as each tweet can be stored against several pids progpos = result[ 5] # Position through the programme that the tweet was made dbtime = datetime.utcfromtimestamp(tweettime) # Each tweet will be grouped into chunks of one minute to make display better, so set the seconds to zero # This particular time is only used for console display now as a more accurate one calculated from programme position is found later dbtime = dbtime.replace(second=0) Print("Analysis component: Analysing new tweet for pid", pid, "(", dbtime, "):") try: Print("Analysis component: '", tweettext, "'") except UnicodeEncodeError, e: Print("UnicodeEncodeError", e) self.db_select( """SELECT duration FROM programmes_unique WHERE pid = %s""", (pid)) progdata = self.db_fetchone() duration = progdata[0] self.db_select( """SELECT totaltweets,meantweets,mediantweets,modetweets,stdevtweets,timediff,timestamp,utcoffset FROM programmes WHERE pid = %s ORDER BY timestamp DESC""", (pid)) progdata2 = self.db_fetchone() totaltweets = progdata2[0] # Increment the total tweets recorded for this programme's broadcast totaltweets += 1 meantweets = progdata2[1] mediantweets = progdata2[2] modetweets = progdata2[3] stdevtweets = progdata2[4] timediff = progdata2[5] timestamp = progdata2[6] utcoffset = progdata2[7] # Need to work out the timestamp to assign to the entry in analysed data progstart = timestamp - timediff progmins = int(progpos / 60) analysedstamp = int(progstart + (progmins * 60)) # Ensure that this tweet occurs within the length of the programme, otherwise for the purposes of this program it's useless if progpos > 0 and progpos <= duration: self.db_select( """SELECT did,totaltweets,wordfreqexpected,wordfrequnexpected FROM analyseddata WHERE pid = %s AND timestamp = %s""", (pid, analysedstamp)) analyseddata = self.db_fetchone() # Just in case of a missing raw json object (ie. 
programme terminated before it was stored - allow it to be skipped if not found after 30 secs) #failcounter = 0 # Pass this tweet to the NLTK analysis component self.send([pid, tweetid], "nltk") # print "BUM", 1 while not self.dataReady("nltk"): # if failcounter >= 3000: # nltkdata = list() # break time.sleep(0.01) # failcounter += 1 #if failcounter < 3000: # print "BUM", 2 if 1: # Receive back a list of words and their frequency for this tweet, including whether or not they are common, an entity etc nltkdata = self.recv("nltk") if analyseddata == None: # No tweets yet recorded for this minute minutetweets = 1 self.db_insert( """INSERT INTO analyseddata (pid,totaltweets,timestamp) VALUES (%s,%s,%s)""", (pid, minutetweets, analysedstamp)) for word in nltkdata: # Check if we're storing a word or phrase here if nltkdata[word][0] == 1: self.db_insert( """INSERT INTO wordanalysis (pid,timestamp,phrase,count,is_keyword,is_entity,is_common) VALUES (%s,%s,%s,%s,%s,%s,%s)""", (pid, analysedstamp, word, nltkdata[word][1], nltkdata[word][2], nltkdata[word][3], nltkdata[word][4])) else: self.db_insert( """INSERT INTO wordanalysis (pid,timestamp,word,count,is_keyword,is_entity,is_common) VALUES (%s,%s,%s,%s,%s,%s,%s)""", (pid, analysedstamp, word, nltkdata[word][1], nltkdata[word][2], nltkdata[word][3], nltkdata[word][4])) else: did = analyseddata[0] minutetweets = analyseddata[ 1] # Get current number of tweets for this minute minutetweets += 1 # Add one to it for this tweet self.db_update( """UPDATE analyseddata SET totaltweets = %s WHERE did = %s""", (minutetweets, did)) for word in nltkdata: # Check if we're storing a word or phrase if nltkdata[word][0] == 1: self.db_select( """SELECT wid,count FROM wordanalysis WHERE pid = %s AND timestamp = %s AND phrase LIKE %s""", (pid, analysedstamp, word)) # Check if this phrase has already been stored for this minute - if so, increment the count wordcheck = self.db_fetchone() if wordcheck == None: self.db_insert( """INSERT INTO wordanalysis (pid,timestamp,phrase,count,is_keyword,is_entity,is_common) VALUES (%s,%s,%s,%s,%s,%s,%s)""", (pid, analysedstamp, word, nltkdata[word][1], nltkdata[word][2], nltkdata[word][3], nltkdata[word][4])) else: self.db_update( """UPDATE wordanalysis SET count = %s WHERE wid = %s""", (nltkdata[word][1] + wordcheck[1], wordcheck[0])) else: self.db_select( """SELECT wid,count FROM wordanalysis WHERE pid = %s AND timestamp = %s AND word LIKE %s""", (pid, analysedstamp, word)) # Check if this word has already been stored for this minute - if so, increment the count wordcheck = self.db_fetchone() if wordcheck == None: self.db_insert( """INSERT INTO wordanalysis (pid,timestamp,word,count,is_keyword,is_entity,is_common) VALUES (%s,%s,%s,%s,%s,%s,%s)""", (pid, analysedstamp, word, nltkdata[word][1], nltkdata[word][2], nltkdata[word][3], nltkdata[word][4])) else: self.db_update( """UPDATE wordanalysis SET count = %s WHERE wid = %s""", (nltkdata[word][1] + wordcheck[1], wordcheck[0])) # Averages / stdev are calculated roughly based on the programme's running time at this point progdate = datetime.utcfromtimestamp( timestamp) + timedelta(seconds=utcoffset) actualstart = progdate - timedelta(seconds=timediff) actualtweettime = datetime.utcfromtimestamp(tweettime + utcoffset) # Calculate how far through the programme this tweet occurred runningtime = actualtweettime - actualstart runningtime = runningtime.seconds if runningtime < 0: runningtime = 0 else: runningtime = float(runningtime) / 60 try: meantweets = totaltweets / runningtime except 
ZeroDivisionError, e: meantweets = 0 self.db_select( """SELECT totaltweets FROM analyseddata WHERE pid = %s AND timestamp >= %s AND timestamp < %s""", (pid, progstart, analysedstamp + duration)) analyseddata = self.db_fetchall() runningtime = int(runningtime) tweetlist = list() for result in analyseddata: totaltweetsmin = result[0] # Create a list of each minute and the total tweets for that minute in the programme tweetlist.append(int(totaltweetsmin)) # Ensure tweetlist has enough entries # If a minute has no tweets, it won't have a database record, so this has to be added if len(tweetlist) < runningtime: additions = runningtime - len(tweetlist) while additions > 0: tweetlist.append(0) additions -= 1 # Order by programme position 0,1,2, mins etc tweetlist.sort() mediantweets = tweetlist[int(len(tweetlist) / 2)] modes = dict() stdevlist = list() for tweet in tweetlist: modes[tweet] = tweetlist.count(tweet) stdevlist.append( (tweet - meantweets) * (tweet - meantweets)) modeitems = [[v, k] for k, v in modes.items()] modeitems.sort(reverse=True) modetweets = int(modeitems[0][1]) stdevtweets = 0 for val in stdevlist: stdevtweets += val try: stdevtweets = math.sqrt(stdevtweets / runningtime) except ZeroDivisionError, e: stdevtweets = 0 # Finished analysis - update DB self.db_update( """UPDATE programmes SET totaltweets = %s, meantweets = %s, mediantweets = %s, modetweets = %s, stdevtweets = %s WHERE pid = %s AND timestamp = %s""", (totaltweets, meantweets, mediantweets, modetweets, stdevtweets, pid, timestamp))
def main(self): # Calculate running total and mean etc self.dbConnect() Print("FinalAnalysisNLTK: Initialised") while not self.finished(): Print("FinalAnalysisNLTK: Waiting for data") if self.dataReady("inbox"): data = self.recv("inbox") Print("FinalAnalysisNLTK: got data") Print( "FinalAnalysisNLTK: ... and ignoring it (Since didn't store results of final analysis anyway..." ) if False: pid = data[0] tweetids = data[1] retweetcache = dict() # Issue #TODO - Words that appear as part of a keyword but not the whole thing won't get marked as being a keyword (e.g. Blue Peter - two diff words) # Need to check for each word if it forms part of a phrase which is also a keyword # If so, don't count is as a word, count the whole thing as a phrase and remember not to count it more than once # May actually store phrases AS WELL AS keywords keywords = dict() # Find keywords for this PID self.db_select( """SELECT keyword,type FROM keywords WHERE pid = %s""", (pid)) keyworddata = self.db_fetchall() Print("FinalAnalysisNLTK: len(keyworddata)", len(keyworddata)) for word in keyworddata: wordname = word[0].lower() if "^" in wordname: wordbits = wordname.split("^") wordname = wordbits[0] wordbits = wordname.split() # Only looking at phrases here (more than one word) if len(wordbits) > 1: keywords[wordname] = word[1] filteredtext = list() Print( "FinalAnalysisNLTK: about to loop through tweet ids - count -", len(tweetids)) for tweetid in tweetids: # Cycle through each tweet and find its JSON tweetdata = None Print("FinalAnalysisNLTK: getting tweet data", len(tweetids)) tweetdatafailcount = 0 tweetfixfailcount = 0 failed_tweet = False while tweetdata == None: Print("FinalAnalysisNLTK: Trying to get tweetdata") self.db_select( """SELECT tweet_json FROM rawtweets WHERE tweet_id = %s""", (tweetid)) tweetdata = self.db_fetchone() if tweetdata == None: tweetdatafailcount += 1 Print( "FinalAnalysisNLTK: failed to tweetdata - count, id:", tweetdatafailcount, tweetid) if tweetdatafailcount > 1000: failed_tweet = True # FIXME: Without the following break, this goes into a busy wait loop, which locks # FIXME: up the scheduler because this component does not yield command back to the caller # FIXME: This results in starvation of the system because it does not allow any other # FIXME: process/component to progress. Since this has happened remarkably rarely, this # FIXME: means the collection & storage has been remarkably reliable. # FIXME: Should be able to actually fix this, and the mystery hangs now too. # FIXME: This "break" is actually sufficient in the short term, but means there's a hang # FIXME: of 1000 cycles -- if this entire block is uncommented. # FIXME: Since this problem happens rarely, bt catastrophically, fixing this really matters. # FIXME: Means it's especially surprising that *any* data was collected during this period, # FIXME: since this would cause the entire system - except other threads - to hang. # FIXME: Crucially, this is also something the zombie killer can't kill, but can (and did) # FIXME: draw attention to. 
break else: Print("FinalAnalysisNLTK: got tweetdata") tweetjson = cjson.decode(tweetdata[0]) self.send(tweetjson, "tweetfixer") twnc_count = 0 while not self.dataReady("tweetfixer"): if twnc_count > 10: # Empirically, twnc_count gets there within 2 or 3 loops break twnc_count += 1 self.pause() yield 1 if not self.dataReady("tweetfixer"): tweetfixfailcount += 1 Print( "FinalAnalysisNLTK: Tweet Fixer Failed - twnc_count, tweetfixfailcount", twnc_count, tweetfixfailcount, tweetid) else: tweetjson = self.recv("tweetfixer") # Identify retweets if tweetjson.has_key('retweeted_status'): if tweetjson[ 'retweeted_status'].has_key( 'id'): statusid = tweetjson[ 'retweeted_status']['id'] if retweetcache.has_key(statusid): retweetcache[statusid][0] += 1 else: retweetcache[statusid] = [ 1, tweetjson[ 'retweeted_status'] ['text'] ] tweettext = self.spellingFixer( tweetjson['filtered_text']).split() for word in tweettext: if word[0] in """!"#$%&()*+,-./:;<=>?@~[\\]?_'`{|}?""" and not ( len(word) <= 3 and (word[0] == ":" or word[0] == ";")): word = word[1:] if word != "": # Done twice to capture things like 'this is a "quote".' if len(word) >= 2: if word[len( word ) - 1] in """!"#$%&()*+,-./:;<=>?@~[\\]?_'`{|}?""" and word[ len(word) - 2:len( word )] != "s'" and not ( len(word) <= 3 and (word[0] == ":" or word[0] == ";")): word = word[:len(word) - 1] if word[len( word ) - 1] in """!"#$%&()*+,-./:;<=>?@~[\\]?_'`{|}?""" and word[ len(word) - 2:len( word )] != "s'" and not ( len(word) <= 3 and (word[0] == ":" or word[0] == ";")): word = word[:len(word ) - 1] elif word[len( word ) - 1] in """!"#$%&()*+,-./:;<=>?@~[\\]?_'`{|}?""" and not ( len(word) <= 3 and (word[0] == ":" or word[0] == ";")): word = word[:len(word) - 1] if word != "": if word[len( word ) - 1] in """!"#$%&()*+,-./:;<=>?@~[\\]?_'`{|}?""" and not ( len(word) <= 3 and (word[0] == ":" or word[0] == ";")): word = word[:len(word ) - 1] if word != "": if word in """!"#$%&()*+,-./:;<=>?@~[\\]?_'`{|}?""": word = "" if word != "": filteredtext.append(word) # Format: {"word" : [is_phrase,count,is_keyword,is_entity,is_common]} # Need to change this for retweets as they should include all the text content if truncated - need some clever merging FIXME TODO wordfreqdata = dict() # Look for phrases - very limited bigram_fd = FreqDist(nltk.bigrams(filteredtext)) # Print(bigram_fd) for entry in bigram_fd: if entry[0] not in """!"#$%&()*+,-./:;<=>?@~[\\]?_'`{|}?""" and entry[ 1] not in """!"#$%&()*+,-./:;<=>?@~[\\]?_'`{|}?""": if entry[0] not in self.exclusions and entry[ 1] not in self.exclusions: for word in keywords: # Print(word) if entry[0] in word and entry[1] in word: # Print("Keyword Match! " , entry[0],entry[1] ) break else: pass #Print( entry[0],entry[1]) # Print("Retweet data: " , retweetcache) self.send( None, "outbox" ) ## FIXME: AAAAAAAAAAAAAAAAAAAARRRRRRRRRRRRRRRRRRRRRRGGGGGGGGGGGGGGGGGGGGGGHHHHHHHHHHHHHHHHHHHHHHH if not self.anyReady(): Print("FinalAnalysisNLTK: about to pause") self.pause() yield 1
def main(self): while True: time.sleep(1) Print("!", self.name)
def main(self): # Print("Entering main of the TwitterStream component", self) self.url = "https://stream.twitter.com/1/statuses/filter.json" self.headers = { "Accept-Encoding": "identity", "Keep-Alive": self.timeout, "Connection": "close", "User-Agent": "BBC R&D Grabber", "Content-Type": "application/x-www-form-urlencoded" } self.datacapture = None counter = 0 blanklinecount = 0 # Print("Entering main loop", self) while not self.finished(): if self.dataReady("inbox"): # Print("New data on inbox", self) if self.datacapture != None: # Print("We have a datacapture component, so need it to shutdown - call it's .stop() method ... (hmm, not correct really and would work with graphline...)", self) L = self.link((self, "_signal"), (self.datacapture, "control")) self.send(producerFinished(), "_signal") self.unlink(self.datacapture ) # Unlinks all linkages relating to this... # self.datacapture.stop() self.datacapture = None # Print("We now believe the subcomponent is dead. Probably erroneously...", self) recvdata = self.recv("inbox") keywords = recvdata[0] if len(keywords) > 400: keywords = keywords[0:400:1] pids = recvdata[1] safe_keywords = [ x.encode("utf8") for x in keywords ] # Needed to preclude unicode encoding issues in urllib... args = urllib.urlencode({"track": ",".join(safe_keywords)}) # Print ("Got keywords:", args) # Print("Create new datacapture component", self) self.connect(args, pids) # Print("Created...", self) while self.dataReady("tweetsin"): counter = 0 tweetdata = self.recv("tweetsin") if tweetdata[0] == "\r\n": blanklinecount += 1 else: blanklinecount = 0 self.send(tweetdata, "outbox") if self.dataReady("inbox"): break if not self.dataReady("tweetsin"): time.sleep(1) if self.datacapture != None: counter += 1 else: counter = 0 # This still isn't great at reducing busy wait CPU usage # Blank line count ensures we reconnect if we get 10 successive keepalives with no data - likely an error if (counter > self.timeout and self.datacapture != None and self.reconnect) or (blanklinecount >= 10 and self.reconnect): Print("counter", counter) Print("self.timeout", self.timeout) Print("self.datacapture", self.datacapture) Print("self.datacapture", self.datacapture.components) Print("self.reconnect", self.reconnect) Print("blanklinecount", blanklinecount) blanklinecount = 0 sys.stderr.write("API Connection Failed: Reconnecting") # import os # os.system("/home/michaels/Checkouts/kamaelia/trunk/Code/Python/Apps/SocialBookmarks/App/LastDitch.sh") # # sys.exit(0) # FIXME Brutal, but effective # self.scheduler.stop() # FIXME Brutal, but effective L = self.link((self, "_signal"), (self.datacapture, "control")) self.send(producerFinished(), "_signal") self.unlink(self.datacapture) self.datacapture = None # Twitter connection has failed counter = 0 self.connect(args, pids)
def doStuff(self, channel): # Check what's on for each channel self.send(channel, "whatson") while not self.dataReady("whatson"): pass data = self.recv("whatson") if data == None: pid = None else: pid = data[0] title = data[1] offset = data[2] duration = data[3] expectedstart = data[4] if pid != self.channels[channel]: # Perhaps just do a duplicate scan before creating Twitter stream if pid == None: self.channels[channel] = None Print (channel, ": Off Air") else: self.channels[channel] = pid self.send(["http://www.bbc.co.uk/programmes/" + pid + ".rdf"], "dataout") while not self.dataReady("datain"): pass recvdata = self.recv("datain") if recvdata[0] == "OK": programmedata = recvdata[1] else: # Fake programme data to prevent crash - not ideal programmedata = '<?xml version="1.0" encoding="utf-8"?> \ <rdf:RDF xmlns:rdf = "http://www.w3.org/1999/02/22-rdf-syntax-ns#" \ xmlns:rdfs = "http://www.w3.org/2000/01/rdf-schema#" \ xmlns:owl = "http://www.w3.org/2002/07/owl#" \ xmlns:foaf = "http://xmlns.com/foaf/0.1/" \ xmlns:po = "http://purl.org/ontology/po/" \ xmlns:mo = "http://purl.org/ontology/mo/" \ xmlns:skos = "http://www.w3.org/2008/05/skos#" \ xmlns:time = "http://www.w3.org/2006/time#" \ xmlns:dc = "http://purl.org/dc/elements/1.1/" \ xmlns:dcterms = "http://purl.org/dc/terms/" \ xmlns:wgs84_pos= "http://www.w3.org/2003/01/geo/wgs84_pos#" \ xmlns:timeline = "http://purl.org/NET/c4dm/timeline.owl#" \ xmlns:event = "http://purl.org/NET/c4dm/event.owl#"> \ </rdf:RDF>' # RDF reader needs to read from a file so write out first # Alternative is to read from a URL, but this lacks proper proxy support filepath = "tempRDF.txt" file = open(filepath, 'w') file.write(programmedata) file.close() g = Graph() # This is a temporary proxy fix. A URL could be put here instead g.parse("tempRDF.txt") # Identify the brand and whether there are any official hashtags twittags = list() for bid in g.subjects(object = rdflib.URIRef('http://purl.org/ontology/po/Brand')): # bid is Brand ID bidmod = bid.replace("#programme","") bidmod = str(bidmod.replace("file:///programmes/","")) if (bidmod in self.officialbrandtags): twittags = self.officialbrandtags[bidmod] break # Identify the series and whether there are any official hashtags if len(twittags) == 0: # Identify the brand and whether there are any official hashtags for sid in g.subjects(object = rdflib.URIRef('http://purl.org/ontology/po/Series')): # sid is Series ID sidmod = sid.replace("#programme","") sidmod = str(sidmod.replace("file:///programmes/","")) if (sidmod in self.officialseriestags): twittags = self.officialseriestags[sidmod] break vidmod = "" so = g.subject_objects(predicate=rdflib.URIRef('http://purl.org/ontology/po/version')) # Pick a version, any version - for this which one doesn't matter for x in so: # vid is version id vid = x[1] vidmod = vid.replace("#programme","") vidmod = vidmod.replace("file:///programmes/","") break # Got version, now get people self.send(["http://www.bbc.co.uk/programmes/" + vidmod + ".rdf"], "dataout") while not self.dataReady("datain"): pass recvdata = self.recv("datain") if recvdata[0] == "OK": versiondata = recvdata[1] else: versiondata = '<?xml version="1.0" encoding="utf-8"?> \ <rdf:RDF xmlns:rdf = "http://www.w3.org/1999/02/22-rdf-syntax-ns#" \ xmlns:rdfs = "http://www.w3.org/2000/01/rdf-schema#" \ xmlns:owl = "http://www.w3.org/2002/07/owl#" \ xmlns:foaf = "http://xmlns.com/foaf/0.1/" \ xmlns:po = "http://purl.org/ontology/po/" \ xmlns:mo = "http://purl.org/ontology/mo/" \ xmlns:skos = 
"http://www.w3.org/2008/05/skos#" \ xmlns:time = "http://www.w3.org/2006/time#" \ xmlns:dc = "http://purl.org/dc/elements/1.1/" \ xmlns:dcterms = "http://purl.org/dc/terms/" \ xmlns:wgs84_pos= "http://www.w3.org/2003/01/geo/wgs84_pos#" \ xmlns:timeline = "http://purl.org/NET/c4dm/timeline.owl#" \ xmlns:event = "http://purl.org/NET/c4dm/event.owl#"> \ </rdf:RDF>' filepath = "tempRDF.txt" file = open(filepath, 'w') file.write(versiondata) file.close() g = Graph() g.parse("tempRDF.txt") # Identify if this is a change of programme, or the first time we've checked what's on for Print clarity if self.firstrun: Print (channel , ": " + title) else: Print (channel , ": Changed to - " , title) # Minor alterations title = title.replace("&","and") if ":" in title: titlebits = title.split(":") title = titlebits[0] # Saving a copy here so apostrophes etc can be used in the Twitter people search titlesave = title # Remove punctuation for item in """!"#$%()*+,-./;<=>?@[\\]?_'`{|}?""": title = title.replace(item,"") keywords = dict() # Save keywords next to a descriptor of what they are keywords[pid] = "PID" # Add official hashtags to the list for tag in twittags: keywords[tag] = "Twitter" # Duplicates will be removed later if string.find(title,"The",0,3) != -1: newtitle = string.replace(re.sub("\s+","",title),"The ","",1) keywords[channel] = "Channel" keywords["#" + string.lower(re.sub("\s+","",title))] = "Title" # Check for and remove year too keywords["#" + string.replace(string.lower(re.sub("\s+","",title))," " + str(date.today().year),"",1)] = "Title" keywords['#' + string.lower(re.sub("\s+","",newtitle))] = "Title" # Check for and remove year too keywords['#' + string.replace(string.lower(re.sub("\s+","",newtitle))," " + str(date.today().year),"",1)] = "Title" else: keywords[channel] = "Channel" keywords["#" + string.lower(re.sub("\s+","",title))] = "Title" keywords["#" + string.replace(string.lower(re.sub("\s+","",title))," " + str(date.today().year),"",1)] = "Title" allwordtitle = string.replace(title,"The ","",1) allwordtitle = allwordtitle.lower() # Remove current year from events allwordtitle = allwordtitle.replace(" " + str(date.today().year),"",1) titlewords = allwordtitle.split() if len(titlewords) > 1: keywords[allwordtitle] = "Title" else: # Trial fix for issue of one word titles producing huge amounts of data keywords[allwordtitle + "^" + "bbc"] = "Title" keywords["#" + re.sub("\s+","",allwordtitle)] = "Title" numwords = dict({"one" : 1, "two" : 2, "three": 3, "four" : 4, "five": 5, "six" : 6, "seven": 7}) for word in numwords: if word in channel.lower() and channel != "asiannetwork": # Bug fix! 
asianne2rk numchannel = string.replace(channel.lower(),word,str(numwords[word])) keywords[numchannel] = "Channel" break if str(numwords[word]) in channel.lower(): numchannel = string.replace(channel.lower(),str(numwords[word]),word) keywords[numchannel] = "Channel" break # Load NameCache (people we've already searched for on Twitter to avoid hammering PeopleSearch) save = False try: homedir = os.path.expanduser("~") file = open(homedir + "/namecache.conf",'r') save = True except IOError: e = sys.exc_info()[1] Print ("Failed to load name cache - will attempt to create a new file: " , e) if save: raw_config = file.read() file.close() try: config = cjson.decode(raw_config) except cjson.DecodeError: e = sys.exc_info()[1] config = dict() else: config = dict() s = g.subjects(predicate=rdflib.URIRef('http://www.w3.org/1999/02/22-rdf-syntax-ns#type'),object=rdflib.URIRef('http://purl.org/ontology/po/Role')) for x in s: rid = g.value(predicate=rdflib.URIRef('http://purl.org/ontology/po/role'),object=rdflib.BNode(x)) pid = g.value(subject=rdflib.BNode(rid),predicate=rdflib.URIRef('http://purl.org/ontology/po/participant')) firstname = str(g.value(subject=rdflib.BNode(pid),predicate=rdflib.URIRef('http://xmlns.com/foaf/0.1/givenName'))) lastname = str(g.value(subject=rdflib.BNode(pid),predicate=rdflib.URIRef('http://xmlns.com/foaf/0.1/familyName'))) if ((firstname + " " + lastname) in config): # Found a cached value if config[firstname + " " + lastname] != "": keywords[config[firstname + " " + lastname]] = "Twitter" else: # Not cached yet - new request self.send(firstname + " " + lastname, "search") while not self.dataReady("search"): pass twitdata = self.recv("search") screenname = "" try: for user in twitdata: # Only use this Twitter screen name if there's a good chance they're the person we're after if ("verified" in user): if (user['verified'] == True or user['followers_count'] > 10000) and string.lower(user['name']) == string.lower(firstname + " " + lastname): screenname = user['screen_name'] keywords[screenname] = "Twitter" break except AttributeError: pass config[firstname + " " + lastname] = screenname keywords[firstname + " " + lastname] = "Participant" s = g.subjects(predicate=rdflib.URIRef('http://www.w3.org/1999/02/22-rdf-syntax-ns#type'),object=rdflib.URIRef('http://purl.org/ontology/po/Character')) for x in s: character = str(g.value(subject=rdflib.BNode(x),predicate=rdflib.URIRef('http://xmlns.com/foaf/0.1/name'))) rid = g.value(predicate=rdflib.URIRef('http://purl.org/ontology/po/role'),object=rdflib.BNode(x)) pid = g.value(subject=rdflib.BNode(rid),predicate=rdflib.URIRef('http://purl.org/ontology/po/participant')) firstname = str(g.value(subject=rdflib.BNode(pid),predicate=rdflib.URIRef('http://xmlns.com/foaf/0.1/givenName'))) lastname = str(g.value(subject=rdflib.BNode(pid),predicate=rdflib.URIRef('http://xmlns.com/foaf/0.1/familyName'))) # This ^ is a temporary fix until I work out a better DB structure keywords[character + "^" + channel] = "Character" keywords[character + "^" + title] = "Character" if " " in character: # Looks like we have a firstname + surname situation charwords = character.split() if charwords[0] != "Dr" and charwords[0] != "Miss" and charwords[0] != "Mr" and charwords[0] != "Mrs" and charwords[0] != "Ms" and charwords[0] != "The": # As long as the first word isn't a title, add it as a first name # This ^ is a temporary fix until I work out a better DB structure keywords[charwords[0] + "^" + channel] = "Character" keywords[charwords[0] + "^" + title] = 
"Character" elif len(charwords) > 2: # If the first word was a title, and the second word isn't a surname (checked by > 2) add the first name # This ^ is a temporary fix until I work out a better DB structure keywords[charwords[1] + "^" + channel] = "Character" keywords[charwords[1] + "^" + title] = "Character" if ((firstname + " " + lastname) in config): # Found a cached value if config[firstname + " " + lastname] != "": keywords[config[firstname + " " + lastname]] = "Actor" else: # Not cached yet - new request self.send(firstname + " " + lastname, "search") while not self.dataReady("search"): pass twitdata = self.recv("search") screenname = "" try: for user in twitdata: if ("verified" in user): if (user['verified'] == True or user['followers_count'] > 10000) and string.lower(user['name']) == string.lower(firstname + " " + lastname): screenname = user['screen_name'] keywords[screenname] = "Twitter" break except AttributeError: pass config[firstname + " " + lastname] = screenname keywords[firstname + " " + lastname] = "Actor" # Radio appears to have been forgotten about a bit in RDF / scheduling at the mo # So, let's do some extra queries and see if the show title is a person's name on Twitter if "radio" in channel or "6music" in channel or "asiannetwork" in channel or "sportsextra" in channel or "worldservice" in channel: # However, radio shows are often named using the DJ - The cases where this isn't true will cause problems however as they'll be saved in json - DOH! TODO if (titlesave in config): # Found a cached value if config[titlesave] != "": keywords[config[titlesave]] = "Twitter" elif len(titlesave.split()) < 4: # Prevent some shows getting through at least - restricts people's names to three words self.send(titlesave, "search") while not self.dataReady("search"): pass twitdata = self.recv("search") screenname = "" try: for user in twitdata: if ("verified" in user): if (user['verified'] == True or user['followers_count'] > 10000) and string.lower(user['name']) == titlesave.lower(): screenname = user['screen_name'] keywords[screenname] = "Twitter" break except AttributeError: pass config[titlesave] = screenname try: file = open(homedir + "/namecache.conf",'w') raw_config = cjson.encode(config) file.write(raw_config) file.close() except IOError: Print ("Failed to save name cache - could cause rate limit problems") return [keywords,data] else: if pid == None: Print(channel , ": No change - Off Air") else: Print (channel , ": No change - " , title)
def main(self):
    self.dbConnect()
    while not self.finished():
        # As in the data collector, create a list of all tweets currently received
        twitdata = list()
        while self.dataReady("inbox"):
            data = self.recv("inbox")
            twitdata.append(data[0])
        if len(twitdata) > 0:
            # Cycle through the tweets, fixing their URLs as before, and storing them if they aren't a status message
            for tweet in twitdata:
                # This may need moving further down the line - ideally it would be handled by cjson
                tweet = tweet.replace("\\/", "/")
                if tweet != "\r\n":
                    newdata = cjson.decode(tweet)
                    if newdata.has_key('delete') or newdata.has_key('scrub_geo') or newdata.has_key('limit'):
                        # It is assumed here that the original data collector has handled the Twitter status message
                        Print("Discarding tweet instruction - captured by other component")
                    else:
                        tweetid = newdata['id']
                        # Capture exactly when this tweet was stored
                        tweetstamp = time.time()
                        tweetsecs = int(tweetstamp)
                        # Include the fractions of seconds portion of the timestamp in a separate field
                        tweetfrac = tweetstamp - tweetsecs
                        # We only have a 16000 VARCHAR field to use in MySQL (through choice) - this should be
                        # enough, but if not, the tweet will be written out to file
                        if len(tweet) < 16000:
                            try:
                                self.db_insert("""INSERT INTO rawtweets (tweet_id,tweet_json,tweet_stored_seconds,tweet_stored_fraction) VALUES (%s,%s,%s,%s)""",
                                               (tweetid, tweet, tweetsecs, tweetfrac))
                            except _mysql_exceptions.IntegrityError, e:
                                # Handle the possibility of Twitter having sent us a duplicate
                                Print("Duplicate tweet ID:", e)
                        else:
                            Print("Discarding tweet - length limit exceeded")
                            tweetcontents = ""
                            homedir = os.path.expanduser("~")
                            if os.path.exists(homedir + "/oversizedtweets.conf"):
                                try:
                                    file = open(homedir + "/oversizedtweets.conf", 'r')
                                    tweetcontents = file.read()
                                    file.close()
                                except IOError, e:
                                    Print("Failed to load oversized tweet cache - it will be overwritten")
                            try:
                                file = open(homedir + "/oversizedtweets.conf", 'w')
                                tweetcontents = tweetcontents + tweet
                                file.write(tweetcontents)
                                file.close()
                            except IOError, e:
                                Print("Failed to save oversized tweet cache")
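# The INSERT and the length guard above imply a small four-column table keyed on the tweet ID
# (the IntegrityError branch only makes sense if tweet_id carries a unique constraint). The
# actual DDL isn't part of this code, so the following is only a guessed-at compatible schema;
# the column names come from the INSERT above, the types are assumptions:
def create_rawtweets_table(cursor):
    # 'cursor' is any MySQL DB-API cursor, e.g. the one the DB wrapper already holds
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS rawtweets (
            tweet_id              BIGINT UNSIGNED NOT NULL,  -- Twitter status id (newdata['id'])
            tweet_json            VARCHAR(16000)  NOT NULL,  -- raw JSON; longer tweets go to oversizedtweets.conf
            tweet_stored_seconds  INT UNSIGNED    NOT NULL,  -- int(time.time()) at storage time
            tweet_stored_fraction DOUBLE          NOT NULL,  -- sub-second part of the same timestamp
            PRIMARY KEY (tweet_id)                           -- duplicates raise IntegrityError, as handled above
        )
    """)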
        except urllib2.HTTPError, e:
            Print("PeopleSearch HTTP error:", e.code)
            # sys.stderr.write('PeopleSearch HTTP error: ' + str(e.code) + '\n')
            conn1 = False
        except urllib2.URLError, e:
            Print("PeopleSearch URL error: ", e.reason)
            # sys.stderr.write('PeopleSearch URL error: ' + str(e.reason) + '\n')
            conn1 = False

        if conn1:
            content = conn1.read()
            conn1.close()

            request_token = dict(urlparse.parse_qsl(content))

            Print("Request Token:")
            Print(" - oauth_token = ", request_token['oauth_token'])
            Print(" - oauth_token_secret = ", request_token['oauth_token_secret'])
            Print("")

            # The user must confirm authorisation so a URL is Printed here
            Print("Go to the following link in your browser:")
            Print("%s?oauth_token=%s" % (authorize_url, request_token['oauth_token']))
            Print("")

            accepted = 'n'
            # Wait until the user has confirmed authorisation
            while accepted.lower() == 'n':
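# For clarity: the request-token response Twitter returns is a plain form-encoded string,
# which is why dict(urlparse.parse_qsl(content)) is enough to unpack it. A tiny illustration
# with made-up token values:
import urlparse

content = "oauth_token=abc123&oauth_token_secret=def456&oauth_callback_confirmed=true"
request_token = dict(urlparse.parse_qsl(content))
assert request_token['oauth_token'] == "abc123"
assert request_token['oauth_token_secret'] == "def456"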
def main(self):
    self.dbConnect()
    oldkeywords = None
    while not self.finished():
        Print("### Checking current programmes ###")
        if self.channel != "all":
            oldpid = self.channels[self.channel]
            if oldpid == None:
                self.db_update("""UPDATE programmes SET imported = 1 WHERE channel = %s""", (self.channel))
            data = self.doStuff(self.channel)
            if data != None:
                keywords = data[0]
                pid = data[1][0]
                title = data[1][1]
                offset = data[1][2]
                duration = data[1][3]
                timestamp = data[1][4]
                utcoffset = data[1][5]
                self.db_update("""UPDATE programmes SET imported = 1 WHERE pid != %s AND channel = %s""", (pid, self.channel))
                self.db_select("""SELECT channel FROM programmes WHERE pid = %s AND timestamp = %s""", (pid, timestamp))
                progentrytest = self.db_fetchone()
                self.db_select("""SELECT duration FROM programmes_unique WHERE pid = %s""", (pid))
                progtest2 = self.db_fetchone()
                if progentrytest == None:
                    # Five columns, so five placeholders (matches the 'all channels' branch below)
                    self.db_insert("""INSERT INTO programmes (pid,timediff,timestamp,utcoffset,channel) VALUES (%s,%s,%s,%s,%s)""",
                                   (pid, offset, timestamp, utcoffset, self.channel))
                    if progtest2 == None:
                        self.db_insert("""INSERT INTO programmes_unique (pid,title,duration) VALUES (%s,%s,%s)""",
                                       (pid, title, duration))
                        for word in keywords:
                            self.db_insert("""INSERT INTO keywords (pid,keyword,type) VALUES (%s,%s,%s)""",
                                           (pid, word, keywords[word]))
                else:
                    # Fix for programmes where the duration is changed at the last minute
                    if progtest2[0] < duration:
                        #self.db_update("""UPDATE programmes SET duration = %s WHERE pid = %s AND timestamp = %s""",(duration,pid,timestamp))
                        self.db_update("""UPDATE programmes_unique SET duration = %s WHERE pid = %s""", (duration, pid))
                keywords = list()
            else:
                keywords = None
            self.db_select("""SELECT keyword FROM keywords WHERE pid = %s""", (pid))
            keywordquery = self.db_fetchall()
            for keyword in keywordquery:
                # The '^' separator is a temporary fix until a better DB structure is worked out
                if "^" in keyword[0]:
                    keywords.append(string.replace(keyword[0], "^", " "))
                else:
                    keywords.append(keyword[0])
            if (keywords != oldkeywords) & (keywords != None):
                Print(keywords)
                self.send([keywords, [pid]], "outbox")
                pass
        else:
            # Still need to fix the 'changed to - off air' problem, but it isn't causing twitter keyword redos thankfully (purely a Printing error)
            # Possible issue will start to occur if programmes change too often - the tweet stream will miss too much
            keywords = list()
            for channel in self.channels:
                oldpid = self.channels[channel]
                if oldpid == None:
                    self.db_update("""UPDATE programmes SET imported = 1 WHERE channel = %s""", (channel))
                data = self.doStuff(channel)
                if data != None:
                    keywordappender = data[0]
                    pid = data[1][0]
                    title = data[1][1]
                    offset = data[1][2]
                    duration = data[1][3]
                    timestamp = data[1][4]
                    utcoffset = data[1][5]
                    self.db_update("""UPDATE programmes SET imported = 1 WHERE pid != %s AND channel = %s""", (pid, channel))
                    self.db_select("""SELECT channel FROM programmes WHERE pid = %s AND timestamp = %s""", (pid, timestamp))
                    progentrytest = self.db_fetchone()
                    self.db_select("""SELECT duration FROM programmes_unique WHERE pid = %s""", (pid))
                    progtest2 = self.db_fetchone()
                    if progentrytest == None:
                        self.db_insert("""INSERT INTO programmes (pid,timediff,timestamp,utcoffset,channel) VALUES (%s,%s,%s,%s,%s)""",
                                       (pid, offset, timestamp, utcoffset, channel))
                        if progtest2 == None:
                            self.db_insert("""INSERT INTO programmes_unique (pid,title,duration) VALUES (%s,%s,%s)""",
                                           (pid, title, duration))
                            for word in keywordappender:
                                self.db_insert("""INSERT INTO keywords (pid,keyword,type) VALUES (%s,%s,%s)""",
                                               (pid, word, keywordappender[word]))
                    else:
                        # Fix for programmes where the duration is changed at the last minute
                        if progtest2[0] < duration:
                            #self.db_update("""UPDATE programmes SET duration = %s WHERE pid = %s AND timestamp = %s""",(duration,pid,timestamp))
                            self.db_update("""UPDATE programmes_unique SET duration = %s WHERE pid = %s""", (duration, pid))

            currentpids = list()
            for channel in self.channels:
                if self.channels[channel] != "" and self.channels[channel] != None:
                    currentpids.append(self.channels[channel])
            for pid in currentpids:
                self.db_select("""SELECT keyword FROM keywords WHERE pid = %s""", (pid))
                keywordquery = self.db_fetchall()
                for keyword in keywordquery:
                    # The '^' separator is a temporary fix until a better DB structure is worked out
                    if "^" in keyword[0]:
                        keywords.append(string.replace(keyword[0], "^", " "))
                    else:
                        keywords.append(keyword[0])
            # Remove repeated keywords here
            if len(keywords) != 0:
                keywords = list(set(keywords))
            if (keywords != oldkeywords) & (len(keywords) != 0):
                Print(keywords)
                self.send([keywords, currentpids], "outbox")  # epicfail: now need to send all pids, and search through them further down the line
                pass
        oldkeywords = keywords
        # At this point, find the version tags to allow further info finding
        # Then, pass keywords to TwitterStream. DataCollector will pick up the data
        # Must deal with errors passed back from TwitterStream here
        self.firstrun = False
        time.sleep(30)  # Wait for 30 secs - don't need as much given the wait time between /programmes requests
        # Could always get this to wait until the programme is due to change, but this *may* miss last minute schedule changes
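# Worth noting: the '^' character in stored keywords acts as a crude join between a term and
# its context (character^channel, character^title, one-word-title^bbc), and is turned back
# into a space when keywords are read out for the Twitter stream, as in the loops above.
# A tiny illustration of that convention using made-up values:
import string

stored_keywords = ["Jo Example^bbcone", "sometitle^bbc", "#sometitle"]

searchterms = []
for keyword in stored_keywords:
    if "^" in keyword:
        searchterms.append(string.replace(keyword, "^", " "))
    else:
        searchterms.append(keyword)

assert searchterms == ["Jo Example bbcone", "sometitle bbc", "#sometitle"]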
def main(self):
    Print("Pausing", self.tag)
    self.pause(1)
    Print("Pausing", self.tag)
    self.send(producerFinished(), "signal")
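# The component above just pauses briefly and then signals shutdown. producerFinished is the
# standard Kamaelia/Axon shutdown message; downstream components conventionally watch their
# "control" inbox for it. A hedged sketch of the receiving side using the usual Axon idiom -
# this is illustrative only, not code from this project:
from Axon.Component import component
from Axon.Ipc import producerFinished, shutdownMicroprocess

class ExampleConsumer(component):
    # Hypothetical component, shown only to illustrate the control/signal convention
    def shutdownRequested(self):
        while self.dataReady("control"):
            msg = self.recv("control")
            if isinstance(msg, (producerFinished, shutdownMicroprocess)):
                self.send(msg, "signal")  # pass the shutdown on downstream
                return True
        return False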
def ShouldWeRestartBookmarks(historical_status):
    deltas = {
        "keywords": [],
        "wordanalysis": [],
        "rawdata": [],
        "analyseddata": [],
        "timestamp": []
    }
    k = 1
    if len(historical_status) < 2:
        # Not enough information
        return False
    while k < len(historical_status):
        deltas["keywords"].append(historical_status[k]["keywords"] - historical_status[k - 1]["keywords"])
        deltas["wordanalysis"].append(historical_status[k]["wordanalysis"] - historical_status[k - 1]["wordanalysis"])
        deltas["rawdata"].append(historical_status[k]["rawdata"] - historical_status[k - 1]["rawdata"])
        deltas["analyseddata"].append(historical_status[k]["analyseddata"] - historical_status[k - 1]["analyseddata"])
        deltas["timestamp"].append(int(historical_status[k]["timestamp"] - historical_status[k - 1]["timestamp"]))
        k += 1

    import pprint
    pprint.pprint(deltas)

    last_hour_schedule_activity = sum(deltas["keywords"][-6:])
    last_2periods_tweet_collation_activity = sum(deltas["rawdata"][-2:])
    last_2periods_wordanalysis_activity = sum(deltas["wordanalysis"][-2:])
    last_2periods_analyseddata_activity = sum(deltas["analyseddata"][-2:])
    # Sum the most recent delta of every counter - zero means nothing at all has moved this period
    all_current_activity = (deltas["keywords"][-1] + deltas["rawdata"][-1]
                            + deltas["wordanalysis"][-1] + deltas["analyseddata"][-1])

    if all_current_activity == 0:
        Print("Bookmarks.py is showing no activity at all, of any kind, very likely dead")
        return True

    if last_hour_schedule_activity == 0:
        if len(deltas["keywords"]) > 5:
            Print("Looks like schedule collation in Bookmarks.py has died, needs restart")
            return True

    if last_2periods_tweet_collation_activity == 0:
        if len(deltas["rawdata"]) > 1:
            Print("Looks like tweet collection activity in Bookmarks.py has died, needs restart")
            return True

    if last_2periods_wordanalysis_activity == 0:
        if len(deltas["wordanalysis"]) > 1:
            Print("Looks like word analysis of tweets - activity - in Bookmarks.py has died, needs restart")
            return True

    if last_2periods_analyseddata_activity == 0:
        if len(deltas["analyseddata"]) > 1:
            Print("No tweets analysed in 2 periods. In all likelihood the analysis subsystem in Bookmarks.py has died, needs restart")
            return True

    # Warnings
    if deltas["rawdata"][-1] == 0:
        Print("WARNING - no tweets collected in 1 period, might be dead. Waiting 1 period")
    if deltas["wordanalysis"][-1] == 0:
        Print("WARNING - no tweet words analysed in 1 period, might be dead. Waiting 1 period")
    if deltas["analyseddata"][-1] == 0:
        Print("WARNING - no tweets analysed in 1 period, might be dead. Waiting 1 period")

    return False
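# ShouldWeRestartBookmarks expects a list of periodic snapshots, each a dict of cumulative
# counters plus a timestamp, so the deltas computed above are per-period increments.
# A minimal illustration with made-up numbers - two identical snapshots in a row mean every
# recent delta is zero, so a restart is recommended:
historical_status = [
    {"keywords": 100, "wordanalysis": 5000, "rawdata": 2000, "analyseddata": 1800, "timestamp": 1300000000},
    {"keywords": 103, "wordanalysis": 5200, "rawdata": 2100, "analyseddata": 1900, "timestamp": 1300000600},
    {"keywords": 103, "wordanalysis": 5200, "rawdata": 2100, "analyseddata": 1900, "timestamp": 1300001200},
]
restart_needed = ShouldWeRestartBookmarks(historical_status)  # -> True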
                    # Finished analysis - update DB
                    self.db_update("""UPDATE programmes SET totaltweets = %s, meantweets = %s, mediantweets = %s, modetweets = %s, stdevtweets = %s WHERE pid = %s AND timestamp = %s""",
                                   (totaltweets, meantweets, mediantweets, modetweets, stdevtweets, pid, timestamp))
                else:
                    pass
                    # Print("Analysis component: Skipping tweet - falls outside the programme's running time")
                # Mark the tweet as analysed
                self.db_update("""UPDATE rawdata SET analysed = 1 WHERE tid = %s""", (tid))
        Print("Analysis component: Done!")

        # Stage 2: If all raw tweets analysed and imported = 1 (all data for this programme stored and programme finished),
        # finalise the analysis - could do bookmark identification here too?
        self.db_select("""SELECT pid,totaltweets,meantweets,mediantweets,modetweets,stdevtweets,timestamp,timediff FROM programmes WHERE imported = 1 AND analysed = 0 LIMIT 5000""")
        data = self.db_fetchall()

        # Cycle through each programme that's ready for final analysis
        for result in data:
            pid = result[0]
            self.db_select("""SELECT duration,title FROM programmes_unique WHERE pid = %s""", (pid))
            data2 = self.db_fetchone()
            if not data2:
                Print("Getting data for duration,title, etc failed - pid",