Example #1
 def db_insert(self, command, args):
     if 1:  # Debugging inserts - specifically looking for tweets not inserted *and* inserted
         caller = inspect.stack()[1]
         Print("DBWrapper", int(caller[2]), caller[3], caller[1], command,
               args)
     self.cursor.execute(command, args)  #xyz
     if 1:
         self.cursor_dupe.execute(command, args)  #xyz
     Print("DBWrapper Insert Completed")
Example #2
 def db_select(self, command, args=None):
     if 1:  # Debugging inserts - specifically looking for tweets not inserted *and* inserted
         caller = inspect.stack()[1]
         Print("DBWrapper", int(caller[2]), caller[3], caller[1], command,
               args)
     if args:
         self.cursor.execute(command, args)  #xyz
     else:
         self.cursor.execute(command)  #xyz
     Print("DBWrapper Select Completed")
Example #3
def store_historical_status(historical_status):
    serialised_data = cjson.encode(historical_status)
    try:
        f = open("historical_status.json", "wb")
        f.write(serialised_data)
        f.close()
    except Exception, e:
        # Doesn't really matter what the problem is. It failed, and there's nothing this code can do about it.
        Print(
            "Failed to WRITE historical_status.json, something is badly broken"
        )
        Print("Exception was: ", e)
Example #4
    def main(self):
        twitterurl = "http://api.twitter.com/1/users/search.json"

        if self.proxy:
            proxyhandler = urllib2.ProxyHandler({"http": self.proxy})
            twitopener = urllib2.build_opener(proxyhandler)
            urllib2.install_opener(twitopener)

        headers = {'User-Agent': "BBC R&D Grabber"}
        postdata = None

        if self.keypair == False:
            # Perform OAuth authentication - as we don't have the secret key pair we need to request it
            # This will require some user input
            request_token_url = 'http://api.twitter.com/oauth/request_token'
            access_token_url = 'http://api.twitter.com/oauth/access_token'
            authorize_url = 'http://api.twitter.com/oauth/authorize'

            token = None
            consumer = oauth.Consumer(key=self.consumerkeypair[0],
                                      secret=self.consumerkeypair[1])

            params = {
                'oauth_version': "1.0",
                'oauth_nonce': oauth.generate_nonce(),
                'oauth_timestamp': int(time.time()),
            }

            params['oauth_consumer_key'] = consumer.key

            req = oauth.Request(method="GET",
                                url=request_token_url,
                                parameters=params)

            signature_method = oauth.SignatureMethod_HMAC_SHA1()
            req.sign_request(signature_method, consumer, token)

            requestheaders = req.to_header()
            requestheaders['User-Agent'] = "BBC R&D Grabber"

            # Connect to Twitter
            try:
                req = urllib2.Request(
                    request_token_url, None, requestheaders
                )  # Why won't this work?!? Is it trying to POST?
                conn1 = urllib2.urlopen(req)
            except httplib.BadStatusLine, e:
                Print("PeopleSearch BadStatusLine error:", e)
                conn1 = False
            except urllib2.HTTPError, e:
                Print("PeopleSearch HTTP error:", e.code)
                #                sys.stderr.write('PeopleSearch HTTP error: ' + str(e.code) + '\n')
                conn1 = False
Example #5
def get_historical_status():

    try:
        f = open("historical_status.json")
        raw_data = f.read()
        f.close()
        historical_status = cjson.decode(raw_data)
    except Exception, e:
        # Doesn't really matter what the problem is. It failed, warn, and use a default.
        Print("Failed to read historical_status.json, creating a new one")
        Print("Exception was: ", e)
        historical_status = []
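
The cjson calls map one-to-one onto the standard json module (cjson.decode is json.loads, cjson.encode is json.dumps), so a dependency-free sketch of the same read-with-fallback is:

import json

def get_historical_status(path="historical_status.json"):
    try:
        f = open(path)
        raw_data = f.read()
        f.close()
        return json.loads(raw_data)
    except Exception as e:
        # Missing or corrupt file: warn and start from an empty history
        print("Failed to read %s, creating a new one (%r)" % (path, e))
        return []
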
Example #6
 def main(self):
     c = 0
     while True:
         if os.path.exists(self.stopfile):
             Print("Stop File Exists - Exitting")
             self.die()
         time.sleep(1)
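
This is the consumer half of a simple stop-file convention: a controlling process touches the file (Example #12 does exactly that with "touch ~/stop_bookmarks") and every worker polls for it. A self-contained sketch of both halves, using a hypothetical path:

import os
import time

STOPFILE = "/tmp/stop_worker"  # Hypothetical; the project uses ~/stop_bookmarks

def request_stop():
    open(STOPFILE, "w").close()  # Equivalent of the shell's "touch"

def worker_loop():
    while not os.path.exists(STOPFILE):
        # ... do one unit of work here ...
        time.sleep(1)
    print("Stop file exists - exiting")
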
Example #7
 def __init__(self, *argv, **argd):
     # This is to ensure that we play nicely inside a general hierarchy
     # Even though we inherit from object. Otherwise we risk breaking the MRO of the class
     # We're used with.
     # These should all succeed now...
     Print("db.user, pass, maindb, nextdb", self.dbuser, self.dbpass,
           self.maindb, self.nextdb)
     super(DBWrapper, self).__init__(*argv, **argd)
     # Now configured centrally, but can be still overridden in the usual kamaelia way :-)
     self.cursor = None  # xyz # dupe
     self.cursor_dupe = None  # xyz #dupe
Example #8
def killBookmarksProcess():
    processline = os.popen('ps 2>/dev/null -aux|grep Bookmarks.py|grep -v grep').read()
    if processline:
        tokens = processline.split()
        user, pid, pcpu, pmem, vsz, rss, tty, stat, start, time, command = tokens[:11]
        args = tokens[11:]
        os.system("kill -9 %s" % pid)
    else:
        Print(
            "Bookmarks.py is not running. This means the shell process wrapping it isn't starting it, or is dead. This program cannot fix that problem."
        )
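
A sketch of the same logic with the kill done in-process rather than via os.system, assuming a POSIX system where ps -aux prints USER PID %CPU ... per line:

import os
import signal
import subprocess

def kill_named_process(name):
    # Same ps|grep pipeline as above, run through subprocess
    out = subprocess.Popen("ps 2>/dev/null -aux | grep %s | grep -v grep" % name,
                           shell=True, stdout=subprocess.PIPE).communicate()[0]
    out = out.decode("utf-8", "replace")
    if not out:
        print("%s is not running" % name)
        return
    pid = int(out.split()[1])  # Second ps column is the PID
    os.kill(pid, signal.SIGKILL)  # Same effect as "kill -9"
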
Example #9
    def main(self):
#        sys.stdout.write("Handle Connect Request Start\n")
        self.control_message = None
        self.had_response = False
        buff = Linebuffer()
        lines = []
        fail = False
        try:
            while True:
                for data in self.Inbox("inbox"):
                    buff.feed(data)
                while buff.chompable():
                    line = buff.chomp()
                    lines.append(line)
                    if line == "\r\n":
                        # We've now got the complete header.
                        # We're now expecting a body, but SHOULD handle it.
                        # For now, let's just handle the response line since it's all we really care about.
                        rest = lines[1:]
                        rline = lines[0]

                        p = rline.find(" ") 
                        if p == -1:
                            raise GeneralFail("HTTP Response Line Parse Failure: "+ repr(http_response_line))
                        http_version = rline[:p]
                        rline = rline[p+1:]

                        p = rline.find(" ") 
                        if p == -1:
                            raise GeneralFail("HTTP Response Line Parse Failure: "+ repr(rline))
                        http_status = rline[:p]
                        human_status = rline[p+1:]

                        if 0:
                            Print ("http_version,http_status,human_status",http_version,http_status,human_status)

                        if http_status != "200":
                            raise GeneralFail("HTTP Connect Failure : "+ repr(rline))

                    self.had_response = True

                self.checkControl()

                if not self.anyReady():
                    self.pause()
                yield 1
        except ShutdownNow:
            pass
        except GeneralFail, e:
            # Yes, we're masking an error. This is nasty.
            fail = True
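
The two find/slice steps above just split the response line on its first two spaces, and str.split with a maxsplit expresses the same parse more directly. A sketch, using a CONNECT-style response line:

def parse_response_line(rline):
    parts = rline.rstrip("\r\n").split(" ", 2)  # At most three fields: version, status, reason
    if len(parts) < 2:
        raise ValueError("HTTP Response Line Parse Failure: %r" % rline)
    human_status = parts[2] if len(parts) == 3 else ""
    return parts[0], parts[1], human_status

print(parse_response_line("HTTP/1.0 200 Connection established"))
# -> ('HTTP/1.0', '200', 'Connection established')
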
Example #10
    def main(self):
        try:
            while True:
                for data in self.Inbox("inbox"):
                    self.send(self.tag + " : " + str(data),  "outbox")

                for data in self.Inbox("togglebox"):
                    Print( "toggling" )
                    self.tag = self.tag[-1::-1] # Reverse it.
                
                self.checkControl()
                
                if not self.anyReady():
                    self.pause()
                yield 1
        except ShutdownNow:
            pass

        Print( "exitting tagger" )
        if self.control_message:
            self.send(self.control_message, "signal")
        else:
            self.send(Axon.Ipc.producerFinished(), "signal")
Example #11
    def main(self):
        while not self.finished():
            if self.dataReady("inbox"):
                # TODO This component is unfinished as it was never found to be needed
                channel = self.recv("inbox")
                time.sleep(1)  # Temporary delay to ensure not hammering /programmes
                nowplayingurl = "http://www.bbc.co.uk" + self.channels[channel] + ".json"

                npdata = None
                decodedcontent = None  # Ensure defined even if the fetch or decode below fails
                # Grab BBC data
                self.send([nowplayingurl], "dataout")
                while not self.dataReady("datain"):
                    yield 1
                    self.pause()
                recvdata = self.recv("datain")
                if recvdata[0] == "OK":
                    content = recvdata[1]
                else:
                    content = None

                # Read and decode now playing info
                if content != None:
                    try:
                        decodedcontent = cjson.decode(content)
                    except cjson.DecodeError:
                        e = sys.exc_info()[1]
                        Print("cjson.DecodeError:", e.message)

                # Analyse now playing info
                if decodedcontent:
                    # Not finished! - now playing json file is empty if nothing is playing!
                    npdata = False

                self.send(npdata, "outbox")
            self.pause()
            yield 1
Example #12
def MySQL_Running():
    processline = os.popen('ps 2>/dev/null -aux|grep mysqld|grep -v grep').read()
    return bool(processline)


try:
    homedir = os.path.expanduser("~")  # Bootstrap from /root/, but use this to find the rest
    config_file = open(homedir + "/twitter-login.conf")
except IOError, e:
    Print("Failed to load login data - exiting")
    sys.exit(0)

raw_config = config_file.read()
config_file.close()
config = cjson.decode(raw_config)

username = config['dbuser']
password = config['dbpass']
unixuser = config['unixuser']
homedir = os.path.expanduser("~" + unixuser)

os.system("touch " + homedir + "/stop_bookmarks")

time.sleep(1)
Example #13
            "WARNING - no tweet words analysed in 1 period, might be dead. Waiting 1 period"
        )
    if deltas["analyseddata"][-1] == 0:
        Print(
            "WARNING - no tweets analysed in 1 period, might be dead. Waiting 1 period"
        )

    return False


try:
    homedir = os.path.expanduser("~")  # Bootstrap from /root/, but use this to find the rest
    config_file = open(homedir + "/twitter-login.conf")
except IOError, e:
    Print("Failed to load login data - exiting")
    sys.exit(0)

raw_config = config_file.read()
config_file.close()
config = cjson.decode(raw_config)

username = config['dbuser']
password = config['dbpass']
unixuser = config['unixuser']
homedir = os.path.expanduser("~" + unixuser)

state = get_database_state()
historical_status = get_historical_status()
add_current_state(historical_status, state)
store_historical_status(historical_status)
Example #14
    def main(self):
        twitterurl = "http://api.twitter.com/1/users/search.json"

        if self.proxy:
            proxyhandler = urllib2.ProxyHandler({"http": self.proxy})
            twitopener = urllib2.build_opener(proxyhandler)
            urllib2.install_opener(twitopener)

        headers = {'User-Agent': "BBC R&D Grabber"}
        postdata = None

        if self.keypair == False:
            # Perform OAuth authentication - as we don't have the secret key pair we need to request it
            # This will require some user input
            request_token_url = 'http://api.twitter.com/oauth/request_token'
            access_token_url = 'http://api.twitter.com/oauth/access_token'
            authorize_url = 'http://api.twitter.com/oauth/authorize'

            token = None
            consumer = oauth.Consumer(key=self.consumerkeypair[0],
                                      secret=self.consumerkeypair[1])

            params = {
                'oauth_version': "1.0",
                'oauth_nonce': oauth.generate_nonce(),
                'oauth_timestamp': int(time.time()),
            }

            params['oauth_consumer_key'] = consumer.key

            req = oauth.Request(method="GET",
                                url=request_token_url,
                                parameters=params)

            signature_method = oauth.SignatureMethod_HMAC_SHA1()
            req.sign_request(signature_method, consumer, token)

            requestheaders = req.to_header()
            requestheaders['User-Agent'] = "BBC R&D Grabber"

            # Connect to Twitter
            try:
                req = urllib2.Request(
                    request_token_url, None, requestheaders
                )  # Why won't this work?!? Is it trying to POST?
                conn1 = urllib2.urlopen(req)
            except httplib.BadStatusLine:
                e = sys.exc_info()[1]
                Print("PeopleSearch BadStatusLine error:", e)
                conn1 = False
            except urllib2.HTTPError:
                e = sys.exc_info()[1]
                Print("PeopleSearch HTTP error:", e.code)
                #                sys.stderr.write('PeopleSearch HTTP error: ' + str(e.code) + '\n')
                conn1 = False
            except urllib2.URLError:
                e = sys.exc_info()[1]
                Print("PeopleSearch URL error: ", e.reason)
                #                sys.stderr.write('PeopleSearch URL error: ' + str(e.reason) + '\n')
                conn1 = False

            if conn1:
                content = conn1.read()
                conn1.close()

                request_token = dict(urlparse.parse_qsl(content))

                Print("Request Token:")
                Print("     - oauth_token        = ",
                      request_token['oauth_token'])
                Print("     - oauth_token_secret = ",
                      request_token['oauth_token_secret'])
                Print("")

                # The user must confirm authorisation so a URL is Printed here
                Print("Go to the following link in your browser:")
                Print("%s?oauth_token=%s" %
                      (authorize_url, request_token['oauth_token']))
                Print("")

                accepted = 'n'
                # Wait until the user has confirmed authorisation
                while accepted.lower() == 'n':
                    accepted = raw_input('Have you authorized me? (y/n) ')
                oauth_verifier = raw_input('What is the PIN? ')

                token = oauth.Token(request_token['oauth_token'],
                                    request_token['oauth_token_secret'])
                token.set_verifier(oauth_verifier)

                params = {
                    'oauth_version': "1.0",
                    'oauth_nonce': oauth.generate_nonce(),
                    'oauth_timestamp': int(time.time()),
                }

                params['oauth_token'] = token.key
                params['oauth_consumer_key'] = consumer.key

                req = oauth.Request(method="GET",
                                    url=access_token_url,
                                    parameters=params)

                signature_method = oauth.SignatureMethod_HMAC_SHA1()
                req.sign_request(signature_method, consumer, token)

                requestheaders = req.to_header()
                requestheaders['User-Agent'] = "BBC R&D Grabber"
                # Connect to Twitter
                try:
                    req = urllib2.Request(
                        access_token_url, "oauth_verifier=%s" % oauth_verifier,
                        requestheaders
                    )  # Why won't this work?!? Is it trying to POST?
                    conn1 = urllib2.urlopen(req)
                except httplib.BadStatusLine:
                    e = sys.exc_info()[1]
                    #                    sys.stderr.write('PeopleSearch BadStatusLine error: ' + str(e) + '\n')
                    Print('PeopleSearch BadStatusLine error: ', e)
                    conn1 = False
                except urllib2.HTTPError:
                    e = sys.exc_info()[1]
                    Print('PeopleSearch HTTP error: ', e.code)
                    conn1 = False
                except urllib2.URLError:
                    e = sys.exc_info()[1]
                    #                    sys.stderr.write('PeopleSearch URL error: ' + str(e.reason) + '\n')
                    Print('PeopleSearch URL error: ', e.reason)
                    conn1 = False

                if conn1:
                    content = conn1.read()
                    conn1.close()
                    access_token = dict(urlparse.parse_qsl(content))

                    # Access tokens retrieved from Twitter
                    Print("Access Token:")
                    Print("     - oauth_token        = ",
                          access_token['oauth_token'])
                    Print("     - oauth_token_secret = ",
                          access_token['oauth_token_secret'])
                    Print("")
                    Print(
                        "You may now access protected resources using the access tokens above."
                    )
                    Print("")

                    save = False
                    # Load config to save OAuth keys
                    try:
                        homedir = os.path.expanduser("~")
                        file = open(homedir + "/twitter-login.conf", 'r')
                        save = True
                    except IOError:
                        e = sys.exc_info()[1]
                        Print(
                            "Failed to load config file - not saving oauth keys: ",
                            e)

                    if save:
                        raw_config = file.read()

                        file.close()

                        # Read config and add new values
                        config = cjson.decode(raw_config)
                        config['key'] = access_token['oauth_token']

                        config['secret'] = access_token['oauth_token_secret']

                        raw_config = cjson.encode(config)

                        # Write out the new config file
                        try:
                            file = open(homedir + "/twitter-login.conf", 'w')
                            file.write(raw_config)
                            file.close()
                        except IOError:
                            e = sys.exc_info()[1]
                            Print("Failed to save oauth keys: ", e)

                    self.keypair = [
                        access_token['oauth_token'],
                        access_token['oauth_token_secret']
                    ]

        while not self.finished():
            # TODO: Implement backoff algorithm in case of connection failures - watch out for the fact this could delay the requester component
            if self.dataReady("inbox"):
                # Retrieve keywords to look up
                person = self.recv("inbox")

                # Ensure we're not rate limited during the first request - if so we'll wait for 15 mins before our next request
                if (datetime.today() - timedelta(minutes=15)) > self.ratelimited:
                    requesturl = twitterurl + "?q=" + urllib.quote(person) + "&per_page=5"

                    params = {
                        'oauth_version': "1.0",
                        'oauth_nonce': oauth.generate_nonce(),
                        'oauth_timestamp': int(time.time()),
                    }

                    token = oauth.Token(key=self.keypair[0],
                                        secret=self.keypair[1])
                    consumer = oauth.Consumer(key=self.consumerkeypair[0],
                                              secret=self.consumerkeypair[1])

                    params['oauth_token'] = token.key
                    params['oauth_consumer_key'] = consumer.key

                    req = oauth.Request(method="GET",
                                        url=requesturl,
                                        parameters=params)

                    signature_method = oauth.SignatureMethod_HMAC_SHA1()
                    req.sign_request(signature_method, consumer, token)

                    requestheaders = req.to_header()
                    requestheaders['User-Agent'] = "BBC R&D Grabber"

                    # Connect to Twitter
                    try:
                        req = urllib2.Request(
                            requesturl, None, requestheaders
                        )  # Why won't this work?!? Is it trying to POST?
                        conn1 = urllib2.urlopen(req)
                    except httplib.BadStatusLine:
                        e = sys.exc_info()[1]
                        #                        sys.stderr.write('PeopleSearch BadStatusLine error: ' + str(e) + '\n')
                        Print('PeopleSearch BadStatusLine error: ', e)
                        conn1 = False
                    except urllib2.HTTPError:
                        e = sys.exc_info()[1]
                        #                        sys.stderr.write('PeopleSearch HTTP error: ' + str(e.code) + '\n')
                        Print('PeopleSearch HTTP error: ', e.code)
                        conn1 = False
                    except urllib2.URLError:
                        e = sys.exc_info()[1]
                        #                        sys.stderr.write('PeopleSearch URL error: ' + str(e.reason) + '\n')
                        Print('PeopleSearch URL error: ', e.reason)
                        conn1 = False

                    if conn1:
                        # Check rate limiting here and Print current limit
                        headers = conn1.info()
                        try:
                            headerlist = string.split(str(headers), "\n")
                        except UnicodeEncodeError:  # str may fail...
                            headerlist = []
                        for line in headerlist:
                            if line != "":
                                splitheader = line.split()
                                if splitheader[0] == "X-FeatureRateLimit-Remaining:" or splitheader[0] == "X-RateLimit-Remaining:":
                                    Print(splitheader[0], " ", splitheader[1])
                                    if int(splitheader[1]) < 5:
                                        self.ratelimited = datetime.today()
                        # Grab json format result of people search here
                        try:
                            data = conn1.read()
                            try:
                                content = cjson.decode(data)
                                self.send(content, "outbox")
                            except cjson.DecodeError:
                                self.send(dict(), "outbox")
                        except IOError:
                            e = sys.exc_info()[1]
                            #                            sys.stderr.write('PeopleSearch IO error: ' + str(e) + '\n')
                            Print('PeopleSearch IO error: ', e)
                            self.send(dict(), "outbox")
                        conn1.close()
                    else:
                        self.send(dict(), "outbox")
                else:
                    Print("Twitter search paused - rate limited")
                    self.send(dict(), "outbox")
            self.pause()
            yield 1
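
Stripped of error handling, the three-legged OAuth 1.0a dance above boils down to three signed requests. A condensed sketch using the same oauth (python-oauth2) calls as the example, with CONSUMER_KEY/CONSUMER_SECRET as placeholders:

# 1. Request token: sign a GET against request_token_url with the consumer only (token=None)
consumer = oauth.Consumer(key=CONSUMER_KEY, secret=CONSUMER_SECRET)
req = oauth.Request(method="GET", url=request_token_url,
                    parameters={'oauth_version': "1.0",
                                'oauth_nonce': oauth.generate_nonce(),
                                'oauth_timestamp': int(time.time()),
                                'oauth_consumer_key': consumer.key})
req.sign_request(oauth.SignatureMethod_HMAC_SHA1(), consumer, None)
# 2. The user authorises in a browser at authorize_url?oauth_token=... and reads back a PIN;
#    the PIN becomes the oauth_verifier sent with the request token to access_token_url
# 3. Access token: the response query string carries oauth_token / oauth_token_secret,
#    which are persisted (here in twitter-login.conf) and used to sign all later API calls
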
Example #15
    def main(self):
        while not self.finished():
            if self.dataReady("inbox"):
                channel = self.recv("inbox")
                time.sleep(1)  # Temporary delay to ensure not hammering /programmes

                # Setup in case of URL errors later
                data = None

                # Define URLs for getting schedule data and DVB bridge information
                # By BBC convention, schedule info runs to 5am the next day
                if datetime.utcnow().hour < 5:
                    scheduleurl = "http://www.bbc.co.uk" + self.channels[
                        channel][1] + "/" + time.strftime(
                            "%Y/%m/%d",
                            time.gmtime(time.time() - 86400)) + ".json"
                else:
                    scheduleurl = "http://www.bbc.co.uk" + self.channels[
                        channel][1] + "/" + time.strftime(
                            "%Y/%m/%d", time.gmtime(time.time())) + ".json"
                #syncschedurl = "http://beta.kamaelia.org:8082/dvb-bridge?command=channel&args=" + urllib.quote(self.channels[channel][0])
                #synctimeurl = "http://beta.kamaelia.org:8082/dvb-bridge?command=time"
                syncschedurl = "http://10.92.164.147:8082/dvb-bridge?command=channel&args=" + urllib.quote(
                    self.channels[channel][0])
                synctimeurl = "http://10.92.164.147:8082/dvb-bridge?command=time"

                content = None
                #                # Grab SyncTV time data to work out the offset between local (NTP) and BBC time (roughly)
                #                self.send([synctimeurl], "dataout")
                #                while not self.dataReady("datain"):
                #                    self.pause()
                #                    yield 1
                #                recvdata = self.recv("datain")
                #                if recvdata[0] == "OK":
                #                    content = recvdata[1]
                #                else:
                #                    content = None

                # Work out time difference between local time and BBC time
                if content != None:
                    try:
                        decodedcontent = cjson.decode(content)
                        if decodedcontent[0] == "OK":
                            difference = time.time(
                            ) - decodedcontent[2]['time']
                    except cjson.DecodeError, e:
                        Print("cjson.DecodeError:", e.message)

                if 'difference' in locals():  # FIXME *SOB*
                    # Grab actual programme start time from DVB bridge channel page
                    self.send([syncschedurl], "dataout")
                    while not self.dataReady("datain"):
                        self.pause()  # Add timeout ?
#                        yield 1
                    recvdata = self.recv("datain")
                    if recvdata[0] == "OK":
                        content = recvdata[1]
                    else:
                        content = None

                    if content != None:
                        try:
                            decodedcontent = cjson.decode(content)
                            if decodedcontent[0] == "OK":
                                proginfo = decodedcontent[2]['info']
                        except cjson.DecodeError, e:
                            Print("cjson.DecodeError:", e.message)

                # Grab BBC schedule data for given channel
                self.send([scheduleurl], "dataout")
                while not self.dataReady("datain"):
                    self.pause()  # FIXME Add timeout?
#                    yield 1
                recvdata = self.recv("datain")
                if recvdata[0] == "OK":
                    content = recvdata[1]
                else:
                    content = None

                # Read and decode schedule
                if content != None:
                    try:
                        decodedcontent = cjson.decode(content)
                    except cjson.DecodeError, e:
                        Print("cjson.DecodeError:", e.message)

                    if 'proginfo' in locals():
                        showdate = proginfo['NOW']['startdate']
                        showtime = proginfo['NOW']['starttime']
                        actualstart = proginfo['changed']
                        showdatetime = datetime.strptime(
                            "%s-%s-%s %s:%s:%s" % (showdate[0], showdate[1], showdate[2], showtime[0], showtime[1], showtime[2]),
                            "%Y-%m-%d %H:%M:%S")

                        # SyncTV (DVB Bridge) produced data - let's trust that
                        if 'decodedcontent' in locals():
                            for programme in decodedcontent['schedule']['day']['broadcasts']:
                                starttime = parse(programme['start'])
                                gmt = pytz.timezone("GMT")
                                starttime = starttime.astimezone(gmt)
                                starttime = starttime.replace(tzinfo=None)
                                # Attempt to identify which DVB bridge programme corresponds to the /programmes schedule to get PID
                                if (showdatetime == starttime
                                        or (showdatetime + timedelta(minutes=1) == starttime
                                            and string.lower(proginfo['NOW']['name']) == string.lower(programme['programme']['display_titles']['title']))
                                        or (showdatetime - timedelta(minutes=1) == starttime
                                            and string.lower(proginfo['NOW']['name']) == string.lower(programme['programme']['display_titles']['title']))):
                                    duration = (proginfo['NOW']['duration'][0] * 60 * 60) + (proginfo['NOW']['duration'][1] * 60) + proginfo['NOW']['duration'][2]
                                    progdate = parse(programme['start'])
                                    tz = progdate.tzinfo
                                    utcoffset = datetime.strptime(
                                        str(tz.utcoffset(progdate)),
                                        "%H:%M:%S")
                                    utcoffset = utcoffset.hour * 60 * 60
                                    # Something's not right with the code below #TODO #FIXME
                                    timestamp = time.mktime(showdatetime.timetuple()) + utcoffset
                                    if 'difference' in locals():
                                        offset = (timestamp - actualstart) - difference
                                    else:
                                        offset = timestamp - actualstart
                                    pid = programme['programme']['pid']
                                    title = programme['programme']['display_titles']['title']
                                    # Fix for unicode errors caused by some /programmes titles
                                    if not isinstance(title, (str, unicode)):
                                        title = str(title)
                                    Print(pid, title, offset, duration,
                                          showdatetime, "GMT", utcoffset)
                                    data = [
                                        pid, title, offset, duration,
                                        timestamp, utcoffset
                                    ]

                    else:
                        # Couldn't use the DVB Bridge, so work out what's on NOW here
                        utcdatetime = datetime.now()

                        # Analyse schedule
                        if 'decodedcontent' in locals():
                            for programme in decodedcontent['schedule']['day']['broadcasts']:
                                starttime = parse(programme['start'])
                                starttime = starttime.replace(tzinfo=None)
                                endtime = parse(programme['end'])
                                endtime = endtime.replace(tzinfo=None)
                                if starttime <= utcdatetime < endtime:
                                    pid = programme['programme']['pid']
                                    title = programme['programme']['display_titles']['title']
                                    # Fix for unicode errors caused by some /programmes titles
                                    if not isinstance(title, (str, unicode)):
                                        title = str(title)
                                    # Has to assume no offset between scheduled and actual programme start time as it knows no better because of the lack of DVB bridge
                                    progdate = parse(programme['start'])
                                    tz = progdate.tzinfo
                                    utcoffset = datetime.strptime(
                                        str(tz.utcoffset(progdate)),
                                        "%H:%M:%S")
                                    utcoffset = utcoffset.hour * 60 * 60
                                    timestamp = time.mktime(progdate.timetuple()) - utcoffset
                                    Print(pid, title, 0, programme['duration'],
                                          programme['start'], utcoffset)
                                    data = [
                                        pid, title, 0, programme['duration'],
                                        timestamp, utcoffset
                                    ]
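
One fragility worth noting: str(tz.utcoffset(progdate)) only matches "%H:%M:%S" for non-negative offsets, and even then drops the sign. tzinfo.utcoffset returns a timedelta, so the offset can be taken numerically instead. A sketch assuming Python 2.7+ for timedelta.total_seconds, using the same dateutil parse as the example:

from dateutil.parser import parse

progdate = parse("2011-05-10T20:00:00+01:00")
offset = progdate.tzinfo.utcoffset(progdate)  # timedelta of +1 hour
utcoffset = int(offset.total_seconds())       # 3600; negative offsets also come out right
print(utcoffset)
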
Example #16
class PeopleSearch(component):
    Inboxes = {
        "inbox": "Receives string indicating a person's name",
        "control": ""
    }
    Outboxes = {
        "outbox":
        "Outputs raw search output from Twitter people search in JSON",
        "signal": ""
    }

    def __init__(self, consumerkeypair, keypair, proxy=False):
        super(PeopleSearch, self).__init__()
        self.proxy = proxy
        self.consumerkeypair = consumerkeypair
        self.keypair = keypair
        self.ratelimited = datetime.today() - timedelta(minutes=15)

    def finished(self):
        while self.dataReady("control"):
            msg = self.recv("control")
            if isinstance(msg, (producerFinished, shutdownMicroprocess)):
                self.send(msg, "signal")
                return True
        return False

    def main(self):
        twitterurl = "http://api.twitter.com/1/users/search.json"

        if self.proxy:
            proxyhandler = urllib2.ProxyHandler({"http": self.proxy})
            twitopener = urllib2.build_opener(proxyhandler)
            urllib2.install_opener(twitopener)

        headers = {'User-Agent': "BBC R&D Grabber"}
        postdata = None

        if self.keypair == False:
            # Perform OAuth authentication - as we don't have the secret key pair we need to request it
            # This will require some user input
            request_token_url = 'http://api.twitter.com/oauth/request_token'
            access_token_url = 'http://api.twitter.com/oauth/access_token'
            authorize_url = 'http://api.twitter.com/oauth/authorize'

            token = None
            consumer = oauth.Consumer(key=self.consumerkeypair[0],
                                      secret=self.consumerkeypair[1])

            params = {
                'oauth_version': "1.0",
                'oauth_nonce': oauth.generate_nonce(),
                'oauth_timestamp': int(time.time()),
            }

            params['oauth_consumer_key'] = consumer.key

            req = oauth.Request(method="GET",
                                url=request_token_url,
                                parameters=params)

            signature_method = oauth.SignatureMethod_HMAC_SHA1()
            req.sign_request(signature_method, consumer, token)

            requestheaders = req.to_header()
            requestheaders['User-Agent'] = "BBC R&D Grabber"

            # Connect to Twitter
            try:
                req = urllib2.Request(
                    request_token_url, None, requestheaders
                )  # Why won't this work?!? Is it trying to POST?
                conn1 = urllib2.urlopen(req)
            except httplib.BadStatusLine, e:
                Print("PeopleSearch BadStatusLine error:", e)
                conn1 = False
            except urllib2.HTTPError, e:
                Print("PeopleSearch HTTP error:", e.code)
                #                sys.stderr.write('PeopleSearch HTTP error: ' + str(e.code) + '\n')
                conn1 = False
            except urllib2.URLError, e:
                Print("PeopleSearch URL error: ", e.reason)
                #                sys.stderr.write('PeopleSearch URL error: ' + str(e.reason) + '\n')
                conn1 = False
Example #17
    def main(self):
        self.dbConnect()
        while not self.finished():
            twitdata = list()
            # Collect all current received tweet JSON and their related PIDs into a twitdata list
            while self.dataReady("inbox"):
                pids = list()
                data = self.recv("inbox")
                for pid in data[1]:
                    pids.append(pid)
                twitdata.append([data[0], pids])
            if len(twitdata) > 0:
                # Process the received twitdata
                for tweet in twitdata:
                    tweet[0] = tweet[0].replace("\\/", "/")  # Fix slashes in links: This may need moving further down the line - ideally it would be handled by cjson
                    if tweet[0] != "\r\n":  # If \r\n is received, this is just a keep alive signal from Twitter every 30 secs
                        # At this point, each 'tweet' contains tweetdata, and a list of possible pids
                        newdata = cjson.decode(tweet[0])
                        if newdata.has_key('delete') or newdata.has_key('scrub_geo') or newdata.has_key('limit'):
                            # Keep a record of all requests from Twitter for deletions, location removal etc
                            # As yet none of these have been received, but this code will store them if they are received to enable debugging
                            filepath = "contentDebug.txt"
                            if os.path.exists(filepath):
                                file = open(filepath, 'r')
                                filecontents = file.read()
                            else:
                                filecontents = ""
                            file = open(filepath, 'w')
                            file.write(filecontents + "\n" +
                                       str(datetime.utcnow()) + " " +
                                       cjson.encode(newdata))
                            file.close()
                        else:
                            # This is a real tweet
                            tweetid = newdata['id']
                            try:
                                Print("New tweet! @",
                                      repr(newdata['user']['screen_name']),
                                      ": " + repr(newdata['text']))
                            except UnicodeEncodeError, e:
                                Print("Unicode error suppressed", e)

                            for pid in tweet[1]:
                                # Cycle through possible pids, grabbing that pid's keywords from the DB
                                # Then, check this tweet against the keywords and save to DB where appropriate (there may be more than one location)
                                self.db_select(
                                    """SELECT keyword,type FROM keywords WHERE pid = %s""",
                                    (pid))
                                data = self.db_fetchall()
                                for row in data:
                                    # Some keywords are stored with a ^. These must be split, and the tweet checked to see if it has both keywords, but not necessarily next to each other
                                    keywords = row[0].split("^")
                                    if len(keywords) == 2:
                                        if string.lower(keywords[0]) in string.lower(newdata['text']) and string.lower(keywords[1]) in string.lower(newdata['text']):
                                            self.db_select(
                                                """SELECT timestamp,timediff FROM programmes WHERE pid = %s ORDER BY timestamp DESC""",
                                                (pid))
                                            progdata = self.db_fetchone()
                                            if progdata != None:
                                                # Ensure the user hasn't already tweeted the same text
                                                # Also ensure they haven't tweeted in the past 10 seconds
                                                timestamp = time.mktime(parse(newdata['created_at']).timetuple())
                                                self.db_select(
                                                    """SELECT * FROM rawdata WHERE (pid = %s AND text = %s AND user = %s) OR (pid = %s AND user = %s AND timestamp >= %s AND timestamp < %s)""",
                                                    (pid, newdata['text'], newdata['user']['screen_name'], pid, newdata['user']['screen_name'], timestamp - 10, timestamp + 10))
                                                if self.db_fetchone() == None:
                                                    Print("Storing tweet for pid ", pid)
                                                    # Work out where this tweet really occurred in the programme using timestamps and DVB bridge data
                                                    progposition = timestamp - (progdata[0] - progdata[1])
                                                    self.db_insert(
                                                        """INSERT INTO rawdata (tweet_id,pid,timestamp,text,user,programme_position) VALUES (%s,%s,%s,%s,%s,%s)""",
                                                        (tweetid, pid, timestamp, newdata['text'], newdata['user']['screen_name'], progposition))
                                                    break  # Break out of this loop and back to check the same tweet against the next programme
                                                else:
                                                    Print("Duplicate tweet from user - ignoring")
                                    if string.lower(row[0]) in string.lower(newdata['text']):
                                        self.db_select(
                                            """SELECT timestamp,timediff FROM programmes WHERE pid = %s ORDER BY timestamp DESC""",
                                            (pid))
                                        progdata = self.db_fetchone()
                                        if progdata != None:
                                            # Ensure the user hasn't already tweeted the same text for this programme
                                            # Also ensure they haven't tweeted in the past 10 seconds
                                            timestamp = time.mktime(parse(newdata['created_at']).timetuple())
                                            self.db_select(
                                                """SELECT * FROM rawdata WHERE (pid = %s AND text = %s AND user = %s) OR (pid = %s AND user = %s AND timestamp >= %s AND timestamp < %s)""",
                                                (pid, newdata['text'], newdata['user']['screen_name'], pid, newdata['user']['screen_name'], timestamp - 10, timestamp + 10))
                                            if self.db_fetchone() == None:
                                                Print("Storing tweet for pid ", pid)
                                                # Work out where this tweet really occurred in the programme using timestamps and DVB bridge data
                                                progposition = timestamp - (progdata[0] - progdata[1])
                                                self.db_insert(
                                                    """INSERT INTO rawdata (tweet_id,pid,timestamp,text,user,programme_position) VALUES (%s,%s,%s,%s,%s,%s)""",
                                                    (tweetid, pid, timestamp, newdata['text'], newdata['user']['screen_name'], progposition))
                                                break  # Break out of this loop and back to check the same tweet against the next programme
                                            else:
                                                Print("Duplicate tweet from user - ignoring")
                    else:
                        Print("Blank line received from Twitter - no new data")

                    Print("Done!")  # new line to break up display
            else:
                time.sleep(0.1)
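
The "^" convention above encodes an unordered AND of two keywords, while a plain keyword is a simple case-insensitive substring test. A self-contained sketch of just that matching rule:

def keyword_matches(keyword, text):
    # "word1^word2" requires both words somewhere in the text, in any order;
    # a keyword without "^" splits into one part, so the same loop covers both cases
    text = text.lower()
    return all(part.lower() in text for part in keyword.split("^"))

print(keyword_matches("doctor^who", "watching WHO tonight with the new Doctor"))  # True
print(keyword_matches("eastenders", "EastEnders was on"))                         # True
print(keyword_matches("doctor^who", "the doctor is in"))                          # False
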
Example #18
# It will also create files called namecache.conf, linkcache.conf and oversizedtweets.conf in your home directory
# See the README for more information

import os
import sys
from Kamaelia.Apps.SocialBookmarks.Print import Print

# Before we do anything.
# First check to see if we're supposed to be running at all. If we're not, don't start!
if os.path.exists(os.path.join(os.path.expanduser("~"), "stop_bookmarks")):
    Print("Exitting bookmarks because ~/stop_bookmarks exists")
    start = False
    sys.exit(0)
else:
    start = True

# import Axon
# Axon.Box.ShowAllTransits = True


if start and (__name__ == "__main__"):

    import cjson

    from Kamaelia.Apps.SocialBookmarks.BBCProgrammes import WhatsOn
    from Kamaelia.Apps.SocialBookmarks.DataCollector import DataCollector, RawDataCollector
Example #19
    def main(self):
        # Print( "With component starting...")
        self.addChildren(self.item)
        self.item.activate()

        try:
            dontcontinue = False
            for graphstep in self.sequence:
                # Print( "Next/this graphstep :", graphstep)
                stopping = 0
                if dontcontinue:
                    break

                links = self.link_graphstep(graphstep)
                while True:
                    # Let sub graphstep run, and wait for completion. Sleep as much as possible.
                    if not self.anyReady():
                        self.pause()
                        yield 1

                    self.checkControl()                            # Told by the outside world to shutdown
                    dontcontinue = self.handleGraphstepShutdown()  # Components inside have shutdown..

                    if self.anyStopped():
    #                    print "Something stopped"
                        all_stopped = True # Assume
                        if self.item._isStopped():
                            Print( "Warning: Child died before completion", self.item )
                            self.shutdownChildComponents(shutdownMicroprocess())
                            dontcontinue = True

                        for child in self.childComponents():
                            # Check assumption
                            if child == self.item:
                                continue
                        
    #                        print "child stopped ?", child._isStopped(), child
                            all_stopped = all_stopped and child._isStopped()

                        if all_stopped:                        
                            break
                        else:
                            stopping += 1
                            if (stopping % 1000) == 0:
                                pass
                                # print "Warning one child exited, but others haven't after", stopping, "loops"

                    yield 1

                    if dontcontinue:
                        break

                for link in links: 
                    self.unlink(thelinkage=link)

    #        print "Exiting With Component... , all_stopped, dontcontinue:", all_stopped, dontcontinue
            self.link( (self, "_signal"), (self.item, "control") )
            self.send( producerFinished(), "_signal")
        except ShutdownNow:
            # Print( "Shutting Down Now")
            self.shutdownChildComponents(self.control_message)
            # Print( "Sending shutdown to The Item")
            self.link( (self, "_signal"), (self.item, "control") )
            self.send( self.control_message, "_signal")
Example #20
    def main(self):
        # Calculate running total and mean etc
        self.dbConnect()
        while not self.finished():
            # The below does LIVE and FINAL analysis - do NOT run DataAnalyser at the same time

            Print("Analysis component: Checking for new data...")

            # Stage 1: Live analysis - could do with a better way to do the first query (indexed field 'analysed' to speed up for now)
            # Could move this into the main app to take a copy of tweets on arrival, but would rather solve separately if poss
            self.db_select(
                """SELECT tid,pid,timestamp,text,tweet_id,programme_position FROM rawdata WHERE analysed = 0 ORDER BY tid LIMIT 5000"""
            )
            data = self.db_fetchall()

            # Cycle through all the as yet unanalysed tweets
            for result in data:
                tid = result[0]
                pid = result[1]
                tweettime = result[2]  # Timestamp based on the tweet's created_at field
                tweettext = result[3]
                tweetid = result[4]  # This is the real tweet ID, tid just makes a unique identifier as each tweet can be stored against several pids
                progpos = result[5]  # Position through the programme that the tweet was made
                dbtime = datetime.utcfromtimestamp(tweettime)
                # Each tweet will be grouped into chunks of one minute to make display better, so set the seconds to zero
                # This particular time is only used for console display now as a more accurate one calculated from programme position is found later
                dbtime = dbtime.replace(second=0)
                Print("Analysis component: Analysing new tweet for pid", pid,
                      "(", dbtime, "):")
                try:
                    Print("Analysis component: '", tweettext, "'")
                except UnicodeEncodeError, e:
                    Print("UnicodeEncodeError", e)
                self.db_select(
                    """SELECT duration FROM programmes_unique WHERE pid = %s""",
                    (pid))
                progdata = self.db_fetchone()
                duration = progdata[0]
                self.db_select(
                    """SELECT totaltweets,meantweets,mediantweets,modetweets,stdevtweets,timediff,timestamp,utcoffset FROM programmes WHERE pid = %s ORDER BY timestamp DESC""",
                    (pid))
                progdata2 = self.db_fetchone()
                totaltweets = progdata2[0]
                # Increment the total tweets recorded for this programme's broadcast
                totaltweets += 1
                meantweets = progdata2[1]
                mediantweets = progdata2[2]
                modetweets = progdata2[3]
                stdevtweets = progdata2[4]
                timediff = progdata2[5]
                timestamp = progdata2[6]
                utcoffset = progdata2[7]

                # Need to work out the timestamp to assign to the entry in analysed data
                progstart = timestamp - timediff
                progmins = int(progpos / 60)
                analysedstamp = int(progstart + (progmins * 60))
                # Ensure that this tweet occurs within the length of the programme, otherwise for the purposes of this program it's useless

                if progpos > 0 and progpos <= duration:
                    self.db_select(
                        """SELECT did,totaltweets,wordfreqexpected,wordfrequnexpected FROM analyseddata WHERE pid = %s AND timestamp = %s""",
                        (pid, analysedstamp))
                    analyseddata = self.db_fetchone()
                    # Just in case of a missing raw json object (ie. programme terminated before
                    # it was stored): the disabled failcounter code was a 30 second escape hatch;
                    # as it stands this wait has no timeout
                    # Pass this tweet to the NLTK analysis component
                    self.send([pid, tweetid], "nltk")
                    while not self.dataReady("nltk"):
                        time.sleep(0.01)
                    # Receive back a list of words and their frequency for this tweet, including whether or not they are common, an entity etc
                    nltkdata = self.recv("nltk")
                    if analyseddata == None:  # No tweets yet recorded for this minute
                        minutetweets = 1
                        self.db_insert(
                            """INSERT INTO analyseddata (pid,totaltweets,timestamp) VALUES (%s,%s,%s)""",
                            (pid, minutetweets, analysedstamp))
                        for word in nltkdata:
                            # Check if we're storing a word or phrase here
                            if nltkdata[word][0] == 1:
                                self.db_insert(
                                    """INSERT INTO wordanalysis (pid,timestamp,phrase,count,is_keyword,is_entity,is_common) VALUES (%s,%s,%s,%s,%s,%s,%s)""",
                                    (pid, analysedstamp, word,
                                     nltkdata[word][1], nltkdata[word][2],
                                     nltkdata[word][3], nltkdata[word][4]))
                            else:
                                self.db_insert(
                                    """INSERT INTO wordanalysis (pid,timestamp,word,count,is_keyword,is_entity,is_common) VALUES (%s,%s,%s,%s,%s,%s,%s)""",
                                    (pid, analysedstamp, word,
                                     nltkdata[word][1], nltkdata[word][2],
                                     nltkdata[word][3], nltkdata[word][4]))
                    else:
                        did = analyseddata[0]
                        minutetweets = analyseddata[1]  # Get current number of tweets for this minute
                        minutetweets += 1  # Add one to it for this tweet

                        self.db_update(
                            """UPDATE analyseddata SET totaltweets = %s WHERE did = %s""",
                            (minutetweets, did))

                        for word in nltkdata:
                            # Check if we're storing a word or phrase
                            if nltkdata[word][0] == 1:
                                self.db_select(
                                    """SELECT wid,count FROM wordanalysis WHERE pid = %s AND timestamp = %s AND phrase LIKE %s""",
                                    (pid, analysedstamp, word))
                                # Check if this phrase has already been stored for this minute - if so, increment the count
                                wordcheck = self.db_fetchone()
                                if wordcheck == None:
                                    self.db_insert(
                                        """INSERT INTO wordanalysis (pid,timestamp,phrase,count,is_keyword,is_entity,is_common) VALUES (%s,%s,%s,%s,%s,%s,%s)""",
                                        (pid, analysedstamp, word,
                                         nltkdata[word][1], nltkdata[word][2],
                                         nltkdata[word][3], nltkdata[word][4]))
                                else:
                                    self.db_update(
                                        """UPDATE wordanalysis SET count = %s WHERE wid = %s""",
                                        (nltkdata[word][1] + wordcheck[1],
                                         wordcheck[0]))
                            else:
                                self.db_select(
                                    """SELECT wid,count FROM wordanalysis WHERE pid = %s AND timestamp = %s AND word LIKE %s""",
                                    (pid, analysedstamp, word))
                                # Check if this word has already been stored for this minute - if so, increment the count
                                wordcheck = self.db_fetchone()
                                if wordcheck == None:
                                    self.db_insert(
                                        """INSERT INTO wordanalysis (pid,timestamp,word,count,is_keyword,is_entity,is_common) VALUES (%s,%s,%s,%s,%s,%s,%s)""",
                                        (pid, analysedstamp, word,
                                         nltkdata[word][1], nltkdata[word][2],
                                         nltkdata[word][3], nltkdata[word][4]))
                                else:
                                    self.db_update(
                                        """UPDATE wordanalysis SET count = %s WHERE wid = %s""",
                                        (nltkdata[word][1] + wordcheck[1],
                                         wordcheck[0]))
                    # Averages / stdev are calculated roughly based on the programme's running time at this point
                    progdate = datetime.utcfromtimestamp(timestamp) + timedelta(seconds=utcoffset)
                    actualstart = progdate - timedelta(seconds=timediff)
                    actualtweettime = datetime.utcfromtimestamp(tweettime + utcoffset)

                    # Calculate how far through the programme this tweet occurred
                    runningtime = actualtweettime - actualstart
                    # Use days + seconds: timedelta.seconds alone loses the day component,
                    # so a tweet from before the start could never produce a negative value
                    runningtime = runningtime.days * 86400 + runningtime.seconds

                    if runningtime < 0:
                        runningtime = 0
                    else:
                        runningtime = float(runningtime) / 60

                    try:
                        meantweets = totaltweets / runningtime
                    except ZeroDivisionError, e:
                        meantweets = 0

                    self.db_select(
                        """SELECT totaltweets FROM analyseddata WHERE pid = %s AND timestamp >= %s AND timestamp < %s""",
                        (pid, progstart, analysedstamp + duration))
                    analyseddata = self.db_fetchall()

                    runningtime = int(runningtime)

                    tweetlist = list()
                    for result in analyseddata:
                        totaltweetsmin = result[0]
                        # Create a list of each minute and the total tweets for that minute in the programme
                        tweetlist.append(int(totaltweetsmin))

                    # Ensure tweetlist has enough entries
                    # If a minute has no tweets, it won't have a database record, so this has to be added
                    if len(tweetlist) < runningtime:
                        additions = runningtime - len(tweetlist)
                        while additions > 0:
                            tweetlist.append(0)
                            additions -= 1

                    # Order by programme position 0,1,2, mins etc
                    tweetlist.sort()

                    mediantweets = tweetlist[int(len(tweetlist) / 2)]

                    modes = dict()
                    stdevlist = list()
                    for tweet in tweetlist:
                        modes[tweet] = tweetlist.count(tweet)
                        stdevlist.append(
                            (tweet - meantweets) * (tweet - meantweets))

                    modeitems = [[v, k] for k, v in modes.items()]
                    modeitems.sort(reverse=True)
                    modetweets = int(modeitems[0][1])

                    stdevtweets = 0
                    for val in stdevlist:
                        stdevtweets += val

                    try:
                        stdevtweets = math.sqrt(stdevtweets / runningtime)
                    except ZeroDivisionError, e:
                        stdevtweets = 0

                    # Finished analysis - update DB
                    self.db_update(
                        """UPDATE programmes SET totaltweets = %s, meantweets = %s, mediantweets = %s, modetweets = %s, stdevtweets = %s WHERE pid = %s AND timestamp = %s""",
                        (totaltweets, meantweets, mediantweets, modetweets,
                         stdevtweets, pid, timestamp))
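
The mean / median / mode / stdev block above can be exercised on its own. Below is a minimal sketch, assuming only a list of per-minute tweet counts and the running time in minutes; summarise_minute_counts is an illustrative name, not part of the original component, and mode ties may be broken differently than by the modeitems sort above.

import math

def summarise_minute_counts(tweetlist, runningtime):
    # Pad with zero-tweet minutes that have no database row, as the component does
    counts = list(tweetlist) + [0] * max(0, runningtime - len(tweetlist))
    counts.sort()
    mean = float(sum(counts)) / runningtime if runningtime else 0
    median = counts[len(counts) // 2] if counts else 0
    mode = max(counts, key=counts.count) if counts else 0  # most frequent per-minute count
    variance = sum((c - mean) ** 2 for c in counts) / runningtime if runningtime else 0
    return mean, median, mode, math.sqrt(variance)

For example, summarise_minute_counts([3, 0, 5], 4) pads the missing fourth minute, sorts to [0, 0, 3, 5], and returns mean 2.0, median 3 and mode 0.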
Example #21
0
    def main(self):
        # Calculate running total and mean etc

        self.dbConnect()
        Print("FinalAnalysisNLTK: Initialised")
        while not self.finished():
            Print("FinalAnalysisNLTK: Waiting for data")
            if self.dataReady("inbox"):
                data = self.recv("inbox")
                Print("FinalAnalysisNLTK: got data")
                Print(
                    "FinalAnalysisNLTK: ... and ignoring it (we never stored the results of the final analysis anyway)"
                )

                if False:
                    pid = data[0]
                    tweetids = data[1]

                    retweetcache = dict()

                    # Characters treated as punctuation when cleaning tweet words below
                    punctuation = """!"#$%&()*+,-./:;<=>?@~[\\]?_'`{|}?"""

                    def is_emoticon(w):
                        # Short tokens starting with : or ; (e.g. ":)") are left intact below
                        return len(w) <= 3 and (w[0] == ":" or w[0] == ";")

                    # Issue #TODO - Words that appear as part of a keyword but not the whole thing won't get marked as being a keyword (e.g. Blue Peter - two diff words)
                    # Need to check for each word if it forms part of a phrase which is also a keyword
                    # If so, don't count it as a word; count the whole thing as a phrase and remember not to count it more than once
                    # May actually store phrases AS WELL AS keywords

                    keywords = dict()
                    # Find keywords for this PID
                    self.db_select(
                        """SELECT keyword,type FROM keywords WHERE pid = %s""",
                        (pid))
                    keyworddata = self.db_fetchall()
                    Print("FinalAnalysisNLTK: len(keyworddata)",
                          len(keyworddata))
                    for word in keyworddata:
                        wordname = word[0].lower()
                        if "^" in wordname:
                            wordbits = wordname.split("^")
                            wordname = wordbits[0]
                        wordbits = wordname.split()
                        # Only looking at phrases here (more than one word)
                        if len(wordbits) > 1:
                            keywords[wordname] = word[1]

                    filteredtext = list()

                    Print(
                        "FinalAnalysisNLTK: about to loop through tweet ids - count -",
                        len(tweetids))

                    for tweetid in tweetids:
                        # Cycle through each tweet and find its JSON
                        tweetdata = None
                        Print("FinalAnalysisNLTK: getting tweet data",
                              len(tweetids))
                        tweetdatafailcount = 0
                        tweetfixfailcount = 0
                        failed_tweet = False
                        while tweetdata == None:
                            Print("FinalAnalysisNLTK: Trying to get tweetdata")
                            self.db_select(
                                """SELECT tweet_json FROM rawtweets WHERE tweet_id = %s""",
                                (tweetid))
                            tweetdata = self.db_fetchone()
                            if tweetdata == None:
                                tweetdatafailcount += 1
                                Print(
                                    "FinalAnalysisNLTK: failed to get tweetdata - count, id:",
                                    tweetdatafailcount, tweetid)
                                if tweetdatafailcount > 1000:
                                    failed_tweet = True
                                    # FIXME: Without the following break, this goes into a busy wait loop, which locks
                                    # FIXME: up the scheduler because this component does not yield control back to the
                                    # FIXME: caller. That starves the system: no other process/component can progress.
                                    # FIXME: Since this has happened remarkably rarely, the collection & storage has
                                    # FIXME: been remarkably reliable. Should be able to actually fix this, and the
                                    # FIXME: mystery hangs, now too.
                                    # FIXME: This "break" is sufficient in the short term, but still means a hang of
                                    # FIXME: 1000 iterations before giving up.
                                    # FIXME: Since this problem happens rarely, but catastrophically, fixing it really
                                    # FIXME: matters. It's especially surprising that *any* data was collected during
                                    # FIXME: such a period, since it hangs the entire system apart from other threads.
                                    # FIXME: Crucially, this is also something the zombie killer can't kill, but can
                                    # FIXME: (and did) draw attention to.
                                    break
                            else:
                                Print("FinalAnalysisNLTK: got tweetdata")
                                tweetjson = cjson.decode(tweetdata[0])

                                self.send(tweetjson, "tweetfixer")
                                twnc_count = 0
                                while not self.dataReady("tweetfixer"):
                                    if twnc_count > 10:
                                        # Empirically, twnc_count gets there within 2 or 3 loops
                                        break
                                    twnc_count += 1
                                    self.pause()
                                    yield 1

                                if not self.dataReady("tweetfixer"):
                                    tweetfixfailcount += 1
                                    Print(
                                        "FinalAnalysisNLTK: Tweet Fixer Failed - twnc_count, tweetfixfailcount, tweetid",
                                        twnc_count, tweetfixfailcount, tweetid)
                                else:
                                    tweetjson = self.recv("tweetfixer")

                                    # Identify retweets
                                    if 'retweeted_status' in tweetjson:
                                        if 'id' in tweetjson['retweeted_status']:
                                            statusid = tweetjson['retweeted_status']['id']
                                            if statusid in retweetcache:
                                                retweetcache[statusid][0] += 1
                                            else:
                                                retweetcache[statusid] = [1, tweetjson['retweeted_status']['text']]

                                    tweettext = self.spellingFixer(
                                        tweetjson['filtered_text']).split()

                                    for word in tweettext:
                                        # Strip one leading punctuation character
                                        if word[0] in punctuation and not is_emoticon(word):
                                            word = word[1:]
                                        if word != "":
                                            # Done twice to capture things like 'this is a "quote".'
                                            if len(word) >= 2:
                                                if word[-1] in punctuation and word[-2:] != "s'" and not is_emoticon(word):
                                                    word = word[:-1]
                                                    if word[-1] in punctuation and word[-2:] != "s'" and not is_emoticon(word):
                                                        word = word[:-1]
                                            elif word[-1] in punctuation and not is_emoticon(word):
                                                word = word[:-1]
                                                if word != "":
                                                    if word[-1] in punctuation and not is_emoticon(word):
                                                        word = word[:-1]
                                        # Drop anything that is now just punctuation
                                        if word != "" and word in punctuation:
                                            word = ""

                                        if word != "":
                                            filteredtext.append(word)

                    # Format: {"word" : [is_phrase,count,is_keyword,is_entity,is_common]}
                    # Need to change this for retweets as they should include all the text content if truncated - need some clever merging FIXME TODO
                    wordfreqdata = dict()

                    # Look for phrases - very limited
                    bigram_fd = FreqDist(nltk.bigrams(filteredtext))

                    # Print(bigram_fd)
                    for entry in bigram_fd:
                        if entry[0] not in punctuation and entry[1] not in punctuation:
                            if entry[0] not in self.exclusions and entry[1] not in self.exclusions:
                                for word in keywords:
                                    # Print(word)
                                    if entry[0] in word and entry[1] in word:
                                        # Print("Keyword Match!", entry[0], entry[1])
                                        break
                                else:
                                    pass
                                    # Print(entry[0], entry[1])
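
                    # Illustrative behaviour of the bigram counting above (assumed, matching
                    # NLTK's FreqDist/bigrams API): feeding it "blue peter is on".split()
                    # yields the pairs ("blue","peter"), ("peter","is") and ("is","on"),
                    # each with a count of 1.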

                    # Print("Retweet data: " , retweetcache)

                self.send(
                    None, "outbox"
                )  ## FIXME: sends None downstream as a placeholder "done" marker - needs a proper protocol

            if not self.anyReady():
                Print("FinalAnalysisNLTK: about to pause")
                self.pause()
            yield 1
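
The FIXME block above pins down the real hazard: a polling loop that never yields starves the Kamaelia scheduler, because this component never hands control back. A bounded, yielding wait avoids both the starvation and the unbounded hang. This is a sketch only - it assumes main() is a Kamaelia-style generator, and wait_until is an illustrative helper, not part of the original code:

def wait_until(poll, max_polls=1000):
    # Generator: re-checks poll() once per scheduler slot, yielding control
    # in between, and gives up after max_polls instead of spinning forever.
    polls = 0
    while not poll() and polls < max_polls:
        polls += 1
        yield 1

For the rawtweets fetch above, poll would re-issue the SELECT and report whether a row has appeared; main() would drive it with "for _ in wait_until(...): yield 1" and treat a still-missing row afterwards as the failed_tweet case.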
Example #22
0
 def main(self):
     while True:
         time.sleep(1)
         Print("!", self.name)
Example #23
0
    def main(self):
        # Print("Entering main of the TwitterStream component", self)
        self.url = "https://stream.twitter.com/1/statuses/filter.json"

        self.headers = {
            "Accept-Encoding": "identity",
            "Keep-Alive": self.timeout,
            "Connection": "close",
            "User-Agent": "BBC R&D Grabber",
            "Content-Type": "application/x-www-form-urlencoded"
        }

        self.datacapture = None
        counter = 0
        blanklinecount = 0

        # Print("Entering main loop", self)
        while not self.finished():
            if self.dataReady("inbox"):
                # Print("New data on inbox", self)
                if self.datacapture != None:
                    # Print("We have a datacapture component, so we need it to shut down - call its .stop() method ... (hmm, not correct really, and would work with graphline...)", self)

                    L = self.link((self, "_signal"),
                                  (self.datacapture, "control"))
                    self.send(producerFinished(), "_signal")

                    self.unlink(self.datacapture)  # Unlinks all linkages relating to this...
                    #                    self.datacapture.stop()
                    self.datacapture = None
                    # Print("We now believe the subcomponent is dead. Probably erroneously...", self)
                recvdata = self.recv("inbox")
                keywords = recvdata[0]
                if len(keywords) > 400:
                    keywords = keywords[:400]

                pids = recvdata[1]

                safe_keywords = [x.encode("utf8") for x in keywords]  # Needed to preclude unicode encoding issues in urllib...
                args = urllib.urlencode({"track": ",".join(safe_keywords)})
                # Print ("Got keywords:", args)

                # Print("Create new datacapture component", self)
                self.connect(args, pids)
                # Print("Created...", self)

            while self.dataReady("tweetsin"):
                counter = 0
                tweetdata = self.recv("tweetsin")
                if tweetdata[0] == "\r\n":
                    blanklinecount += 1
                else:
                    blanklinecount = 0
                self.send(tweetdata, "outbox")
                if self.dataReady("inbox"):
                    break
            if not self.dataReady("tweetsin"):
                time.sleep(1)
                if self.datacapture != None:
                    counter += 1
                else:
                    counter = 0
                # This still isn't great at reducing busy wait CPU usage
                # Blank line count ensures we reconnect if we get 10 successive keepalives with no data - likely an error
            if (counter > self.timeout and self.datacapture != None
                    and self.reconnect) or (blanklinecount >= 10
                                            and self.reconnect):
                Print("counter", counter)
                Print("self.timeout", self.timeout)
                Print("self.datacapture", self.datacapture)
                Print("self.datacapture", self.datacapture.components)
                Print("self.reconnect", self.reconnect)
                Print("blanklinecount", blanklinecount)
                blanklinecount = 0
                sys.stderr.write("API Connection Failed: Reconnecting")

                #                import os
                #                os.system("/home/michaels/Checkouts/kamaelia/trunk/Code/Python/Apps/SocialBookmarks/App/LastDitch.sh")
                #
                #                sys.exit(0) # FIXME Brutal, but effective
                #                self.scheduler.stop() # FIXME Brutal, but effective
                L = self.link((self, "_signal"), (self.datacapture, "control"))
                self.send(producerFinished(), "_signal")

                self.unlink(self.datacapture)
                self.datacapture = None
                # Twitter connection has failed
                counter = 0
                self.connect(args, pids)
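
The reconnect condition above combines three signals: a silence counter past the timeout, ten successive keepalive blank lines, and the reconnect flag. Pulled out as a pure function it is easier to read and test. A sketch, with should_reconnect as an illustrative name:

def should_reconnect(counter, timeout, capture_alive, blanklinecount, reconnect):
    # Mirrors: (counter > timeout and datacapture alive and reconnect)
    #          or (blanklinecount >= 10 and reconnect)
    if not reconnect:
        return False
    if counter > timeout and capture_alive:
        return True
    return blanklinecount >= 10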
Example #24
0
    def doStuff(self, channel):
        # Check what's on for each channel
        self.send(channel, "whatson")
        while not self.dataReady("whatson"):
            pass
        data = self.recv("whatson")
        if data == None:
            pid = None
        else:
            pid = data[0]
            title = data[1]
            offset = data[2]
            duration = data[3]
            expectedstart = data[4]
        if pid != self.channels[channel]:
            # Perhaps just do a duplicate scan before creating Twitter stream
            if pid == None:
                self.channels[channel] = None
                Print (channel, ": Off Air")
            else:
                self.channels[channel] = pid
                self.send(["http://www.bbc.co.uk/programmes/" + pid + ".rdf"], "dataout")
                while not self.dataReady("datain"):
                    pass
                recvdata = self.recv("datain")
                
                if recvdata[0] == "OK":
                    programmedata = recvdata[1]
                else:
                    # Fake programme data to prevent crash - not ideal
                    programmedata = '<?xml version="1.0" encoding="utf-8"?> \
                                    <rdf:RDF xmlns:rdf      = "http://www.w3.org/1999/02/22-rdf-syntax-ns#" \
                                             xmlns:rdfs     = "http://www.w3.org/2000/01/rdf-schema#" \
                                             xmlns:owl      = "http://www.w3.org/2002/07/owl#" \
                                             xmlns:foaf     = "http://xmlns.com/foaf/0.1/" \
                                             xmlns:po       = "http://purl.org/ontology/po/" \
                                             xmlns:mo       = "http://purl.org/ontology/mo/" \
                                             xmlns:skos     = "http://www.w3.org/2008/05/skos#" \
                                             xmlns:time     = "http://www.w3.org/2006/time#" \
                                             xmlns:dc       = "http://purl.org/dc/elements/1.1/" \
                                             xmlns:dcterms  = "http://purl.org/dc/terms/" \
                                             xmlns:wgs84_pos= "http://www.w3.org/2003/01/geo/wgs84_pos#" \
                                             xmlns:timeline = "http://purl.org/NET/c4dm/timeline.owl#" \
                                             xmlns:event    = "http://purl.org/NET/c4dm/event.owl#"> \
                                    </rdf:RDF>'

                # RDF reader needs to read from a file so write out first
                # Alternative is to read from a URL, but this lacks proper proxy support
                filepath = "tempRDF.txt"
                file = open(filepath, 'w')
                file.write(programmedata)
                file.close()

                g = Graph()
                # This is a temporary proxy fix. A URL could be put here instead
                g.parse("tempRDF.txt")

                # Identify the brand and whether there are any official hashtags
                twittags = list()
                for bid in g.subjects(object = rdflib.URIRef('http://purl.org/ontology/po/Brand')):
                    # bid is Brand ID
                    bidmod = bid.replace("#programme","")
                    bidmod = str(bidmod.replace("file:///programmes/",""))
                    if (bidmod in self.officialbrandtags):
                        twittags = self.officialbrandtags[bidmod]
                        break

                # Identify the series and whether there are any official hashtags
                if len(twittags) == 0:
                    # Identify the brand and whether there are any official hashtags
                    for sid in g.subjects(object = rdflib.URIRef('http://purl.org/ontology/po/Series')):
                        # sid is Series ID
                        sidmod = sid.replace("#programme","")
                        sidmod = str(sidmod.replace("file:///programmes/",""))
                        if (sidmod in self.officialseriestags):
                            twittags = self.officialseriestags[sidmod]
                            break

                vidmod = ""
                so = g.subject_objects(predicate=rdflib.URIRef('http://purl.org/ontology/po/version'))
                # Pick a version, any version - for this which one doesn't matter
                for x in so:
                    # vid is version id
                    vid = x[1]
                    vidmod = vid.replace("#programme","")
                    vidmod = vidmod.replace("file:///programmes/","")
                    break

                # Got version, now get people

                self.send(["http://www.bbc.co.uk/programmes/" + vidmod + ".rdf"], "dataout")
                while not self.dataReady("datain"):
                    pass
                recvdata = self.recv("datain")
                if recvdata[0] == "OK":
                    versiondata = recvdata[1]
                else:
                    versiondata = '<?xml version="1.0" encoding="utf-8"?> \
                                    <rdf:RDF xmlns:rdf      = "http://www.w3.org/1999/02/22-rdf-syntax-ns#" \
                                             xmlns:rdfs     = "http://www.w3.org/2000/01/rdf-schema#" \
                                             xmlns:owl      = "http://www.w3.org/2002/07/owl#" \
                                             xmlns:foaf     = "http://xmlns.com/foaf/0.1/" \
                                             xmlns:po       = "http://purl.org/ontology/po/" \
                                             xmlns:mo       = "http://purl.org/ontology/mo/" \
                                             xmlns:skos     = "http://www.w3.org/2008/05/skos#" \
                                             xmlns:time     = "http://www.w3.org/2006/time#" \
                                             xmlns:dc       = "http://purl.org/dc/elements/1.1/" \
                                             xmlns:dcterms  = "http://purl.org/dc/terms/" \
                                             xmlns:wgs84_pos= "http://www.w3.org/2003/01/geo/wgs84_pos#" \
                                             xmlns:timeline = "http://purl.org/NET/c4dm/timeline.owl#" \
                                             xmlns:event    = "http://purl.org/NET/c4dm/event.owl#"> \
                                    </rdf:RDF>'

                filepath = "tempRDF.txt"
                file = open(filepath, 'w')
                file.write(versiondata)
                file.close()

                g = Graph()
                g.parse("tempRDF.txt")

                # Identify whether this is a change of programme, or the first time we've checked what's on - only affects the console output
                if self.firstrun:
                    Print (channel , ": " + title)
                else:
                    Print (channel , ": Changed to - " , title)

                # Minor alterations
                title = title.replace("&","and")

                if ":" in title:
                    titlebits = title.split(":")
                    title = titlebits[0]

                # Saving a copy here so apostrophes etc can be used in the Twitter people search
                titlesave = title

                # Remove punctuation
                for item in """!"#$%()*+,-./;<=>?@[\\]?_'`{|}?""":
                    title = title.replace(item,"")

                keywords = dict()
                # Save keywords next to a descriptor of what they are
                keywords[pid] = "PID"

                # Add official hashtags to the list
                for tag in twittags:
                    keywords[tag] = "Twitter"

                # Duplicates will be removed later
                if string.find(title,"The",0,3) != -1:
                    newtitle = string.replace(re.sub("\s+","",title),"The ","",1)
                    keywords[channel] = "Channel"
                    keywords["#" + string.lower(re.sub("\s+","",title))] = "Title"
                    # Check for and remove year too
                    keywords["#" + string.replace(string.lower(re.sub("\s+","",title))," " + str(date.today().year),"",1)] = "Title"
                    keywords['#' + string.lower(re.sub("\s+","",newtitle))] = "Title"
                    # Check for and remove year too
                    keywords['#' + string.replace(string.lower(re.sub("\s+","",newtitle))," " + str(date.today().year),"",1)] = "Title"
                else:
                    keywords[channel] = "Channel"
                    keywords["#" + string.lower(re.sub("\s+","",title))] = "Title"
                    keywords["#" + string.replace(string.lower(re.sub("\s+","",title))," " + str(date.today().year),"",1)] = "Title"

                allwordtitle = string.replace(title,"The ","",1)
                allwordtitle = allwordtitle.lower()
                # Remove current year from events
                allwordtitle = allwordtitle.replace(" " + str(date.today().year),"",1)
                titlewords = allwordtitle.split()
                if len(titlewords) > 1:
                    keywords[allwordtitle] = "Title"
                else:
                    # Trial fix for issue of one word titles producing huge amounts of data
                    keywords[allwordtitle + "^" + "bbc"] = "Title"
                keywords["#" + re.sub("\s+","",allwordtitle)] = "Title"

                numwords = dict({"one" : 1, "two" : 2, "three": 3, "four" : 4, "five": 5, "six" : 6, "seven": 7})
                for word in numwords:
                    if word in channel.lower() and channel != "asiannetwork": # Bug fix: "two" inside "network" would give "asianne2rk"
                        numchannel = string.replace(channel.lower(),word,str(numwords[word]))
                        keywords[numchannel] = "Channel"
                        break
                    if str(numwords[word]) in channel.lower():
                        numchannel = string.replace(channel.lower(),str(numwords[word]),word)
                        keywords[numchannel] = "Channel"
                        break

                # Load NameCache (people we've already searched for on Twitter to avoid hammering PeopleSearch)
                save = False
                try:
                    homedir = os.path.expanduser("~")
                    file = open(homedir + "/namecache.conf",'r')
                    save = True
                except IOError:
                    e = sys.exc_info()[1]
                    Print ("Failed to load name cache - will attempt to create a new file: " ,  e)

                if save:
                    raw_config = file.read()
                    file.close()
                    try:
                        config = cjson.decode(raw_config)
                    except cjson.DecodeError:
                        e = sys.exc_info()[1]
                        config = dict()
                else:
                    config = dict()

                s = g.subjects(predicate=rdflib.URIRef('http://www.w3.org/1999/02/22-rdf-syntax-ns#type'),object=rdflib.URIRef('http://purl.org/ontology/po/Role'))

                for x in s:
                    rid = g.value(predicate=rdflib.URIRef('http://purl.org/ontology/po/role'),object=rdflib.BNode(x))
                    pid = g.value(subject=rdflib.BNode(rid),predicate=rdflib.URIRef('http://purl.org/ontology/po/participant'))
                    firstname = str(g.value(subject=rdflib.BNode(pid),predicate=rdflib.URIRef('http://xmlns.com/foaf/0.1/givenName')))
                    lastname = str(g.value(subject=rdflib.BNode(pid),predicate=rdflib.URIRef('http://xmlns.com/foaf/0.1/familyName')))

                    if ((firstname + " " + lastname) in config):
                        # Found a cached value
                        if config[firstname + " " + lastname] != "":
                            keywords[config[firstname + " " + lastname]] = "Twitter"
                    else:
                        # Not cached yet - new request
                        self.send(firstname + " " + lastname, "search")
                        while not self.dataReady("search"):
                            pass
                        twitdata = self.recv("search")
                        screenname = ""
                        try:
                            for user in twitdata:
                                # Only use this Twitter screen name if there's a good chance they're the person we're after
                                if ("verified" in user):
                                    if (user['verified'] == True or user['followers_count'] > 10000) and string.lower(user['name']) == string.lower(firstname + " " + lastname):
                                        screenname = user['screen_name']
                                        keywords[screenname] = "Twitter"
                                        break
                        except AttributeError:
                            pass
                        config[firstname + " " + lastname] = screenname
                    keywords[firstname + " " + lastname] = "Participant"

                s = g.subjects(predicate=rdflib.URIRef('http://www.w3.org/1999/02/22-rdf-syntax-ns#type'),object=rdflib.URIRef('http://purl.org/ontology/po/Character'))

                for x in s:
                    character = str(g.value(subject=rdflib.BNode(x),predicate=rdflib.URIRef('http://xmlns.com/foaf/0.1/name')))
                    rid = g.value(predicate=rdflib.URIRef('http://purl.org/ontology/po/role'),object=rdflib.BNode(x))
                    pid = g.value(subject=rdflib.BNode(rid),predicate=rdflib.URIRef('http://purl.org/ontology/po/participant'))
                    firstname = str(g.value(subject=rdflib.BNode(pid),predicate=rdflib.URIRef('http://xmlns.com/foaf/0.1/givenName')))
                    lastname = str(g.value(subject=rdflib.BNode(pid),predicate=rdflib.URIRef('http://xmlns.com/foaf/0.1/familyName')))
                    # This ^ is a temporary fix until I work out a better DB structure
                    keywords[character + "^" + channel] = "Character"
                    keywords[character + "^" + title] = "Character"
                    if " " in character:
                        # Looks like we have a firstname + surname situation
                        charwords = character.split()
                        if charwords[0] != "Dr" and charwords[0] != "Miss" and charwords[0] != "Mr" and charwords[0] != "Mrs" and charwords[0] != "Ms" and charwords[0] != "The":
                            # As long as the first word isn't a title, add it as a first name
                            # This ^ is a temporary fix until I work out a better DB structure
                            keywords[charwords[0] + "^" + channel] = "Character"
                            keywords[charwords[0] + "^" + title] = "Character"
                        elif len(charwords) > 2:
                            # If the first word was a title, and the second word isn't a surname (checked by > 2) add the first name
                            # This ^ is a temporary fix until I work out a better DB structure
                            keywords[charwords[1] + "^" + channel] = "Character"
                            keywords[charwords[1] + "^" + title] = "Character"
                    if ((firstname + " " + lastname) in config):
                        # Found a cached value
                        if config[firstname + " " + lastname] != "":
                            keywords[config[firstname + " " + lastname]] = "Actor"
                    else:
                        # Not cached yet - new request
                        self.send(firstname + " " + lastname, "search")
                        while not self.dataReady("search"):
                            pass
                        twitdata = self.recv("search")
                        screenname = ""
                        try:
                            for user in twitdata:
                                if ("verified" in user):
                                    if (user['verified'] == True or user['followers_count'] > 10000) and string.lower(user['name']) == string.lower(firstname + " " + lastname):
                                        screenname = user['screen_name']
                                        keywords[screenname] = "Twitter"
                                        break
                        except AttributeError:
                            pass
                        config[firstname + " " + lastname] = screenname
                    keywords[firstname + " " + lastname] = "Actor"

                # Radio appears to have been forgotten about a bit in RDF / scheduling at the mo
                # So, let's do some extra queries and see if the show title is a person's name on Twitter
                if "radio" in channel or "6music" in channel or "asiannetwork" in channel or "sportsextra" in channel or "worldservice" in channel:
                    # However, radio shows are often named using the DJ - The cases where this isn't true will cause problems however as they'll be saved in json - DOH! TODO
                    if (titlesave in config):
                        # Found a cached value
                        if config[titlesave] != "":
                            keywords[config[titlesave]] = "Twitter"
                    elif len(titlesave.split()) < 4: # Prevent some shows getting through at least - restricts people's names to three words
                        self.send(titlesave, "search")
                        while not self.dataReady("search"):
                            pass
                        twitdata = self.recv("search")
                        screenname = ""
                        try:
                            for user in twitdata:
                                if ("verified" in user):
                                    if (user['verified'] == True or user['followers_count'] > 10000) and string.lower(user['name']) == titlesave.lower():
                                        screenname = user['screen_name']
                                        keywords[screenname] = "Twitter"
                                        break
                        except AttributeError:
                            pass
                        config[titlesave] = screenname

                try:
                    file = open(homedir + "/namecache.conf",'w')
                    raw_config = cjson.encode(config)
                    file.write(raw_config)
                    file.close()
                except IOError:
                    Print ("Failed to save name cache - could cause rate limit problems")

                return [keywords,data]
            
        else:
            if pid == None:
                Print(channel , ": No change - Off Air")
            else:
                Print (channel , ": No change - " , title)
Example #25
0
 def main(self):
     self.dbConnect()
     while not self.finished():
         twitdata = list()
         # As in the data collector, create a list of all tweets currently received
         while self.dataReady("inbox"):
             data = self.recv("inbox")
             twitdata.append(data[0])
         if len(twitdata) > 0:
             # Cycle through the tweets, fixing their URLs as before, and storing them if they aren't a status message
             for tweet in twitdata:
                 tweet = tweet.replace("\\/", "/")  # This may need moving further down the line - ideally it would be handled by cjson
                 if tweet != "\r\n":
                     newdata = cjson.decode(tweet)
                     if 'delete' in newdata or 'scrub_geo' in newdata or 'limit' in newdata:
                         # It is assumed here that the original data collector has handled the Twitter status message
                         Print(
                             "Discarding tweet instruction - captured by other component"
                         )
                     else:
                         tweetid = newdata['id']
                         # Capture exactly when this tweet was stored
                         tweetstamp = time.time()
                         tweetsecs = int(tweetstamp)
                         # Include the fractions of seconds portion of the timestamp in a separate field
                         tweetfrac = tweetstamp - tweetsecs
                         # We only have a 16000 VARCHAR field to use in MySQL (through choice) - this should be enough, but if not, the tweet will be written out to file
                         if len(tweet) < 16000:
                             try:
                                 self.db_insert(
                                     """INSERT INTO rawtweets (tweet_id,tweet_json,tweet_stored_seconds,tweet_stored_fraction) VALUES (%s,%s,%s,%s)""",
                                     (tweetid, tweet, tweetsecs, tweetfrac))
                             except _mysql_exceptions.IntegrityError, e:
                                 # Handle the possibility for Twitter having sent us a duplicate
                                 Print("Duplicate tweet ID:", e)
                         else:
                             Print(
                                 "Discarding tweet - length limit exceeded")
                             tweetcontents = ""
                             homedir = os.path.expanduser("~")
                             if os.path.exists(homedir +
                                               "/oversizedtweets.conf"):
                                 try:
                                     file = open(
                                         homedir + "/oversizedtweets.conf",
                                         'r')
                                     tweetcontents = file.read()
                                     file.close()
                                 except IOError, e:
                                     Print(
                                         "Failed to load oversized tweet cache - it will be overwritten"
                                     )
                             try:
                                 file = open(
                                     homedir + "/oversizedtweets.conf", 'w')
                                 tweetcontents = tweetcontents + tweet
                                 file.write(tweetcontents)
                                 file.close()
                             except IOError, e:
                                 Print(
                                     "Failed to save oversized tweet cache")
Example #26
0
            except urllib2.HTTPError, e:
                Print("PeopleSearch HTTP error:", e.code)
                #                sys.stderr.write('PeopleSearch HTTP error: ' + str(e.code) + '\n')
                conn1 = False
            except urllib2.URLError, e:
                Print("PeopleSearch URL error: ", e.reason)
                #                sys.stderr.write('PeopleSearch URL error: ' + str(e.reason) + '\n')
                conn1 = False

            if conn1:
                content = conn1.read()
                conn1.close()

                request_token = dict(urlparse.parse_qsl(content))

                Print("Request Token:")
                Print("     - oauth_token        = ",
                      request_token['oauth_token'])
                Print("     - oauth_token_secret = ",
                      request_token['oauth_token_secret'])
                Print("")

                # The user must confirm authorisation so a URL is Printed here
                Print("Go to the following link in your browser:")
                Print("%s?oauth_token=%s" %
                      (authorize_url, request_token['oauth_token']))
                Print("")

                accepted = 'n'
                # Wait until the user has confirmed authorisation
                while accepted.lower() == 'n':
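
The request-token step above relies on Twitter returning a form-encoded body. A minimal sketch of that parse (the token values are invented for illustration):

import urlparse  # Python 2, as in the surrounding code

content = "oauth_token=abc123&oauth_token_secret=def456"
request_token = dict(urlparse.parse_qsl(content))
assert request_token['oauth_token'] == 'abc123'
assert request_token['oauth_token_secret'] == 'def456'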
Example #27
0
    def main(self):
        self.dbConnect()
        oldkeywords = None
        while not self.finished():
            Print ("### Checking current programmes ###")
            if self.channel != "all":
                oldpid = self.channels[self.channel]
                if oldpid == None:
                    self.db_update("""UPDATE programmes SET imported = 1 WHERE channel = %s""",(self.channel))
                data = self.doStuff(self.channel)
                if data != None:
                    keywords = data[0]
                    pid = data[1][0]
                    title = data[1][1]
                    offset = data[1][2]
                    duration = data[1][3]
                    timestamp = data[1][4]
                    utcoffset = data[1][5]
                    self.db_update("""UPDATE programmes SET imported = 1 WHERE pid != %s AND channel = %s""",(pid,self.channel))
                    self.db_select("""SELECT channel FROM programmes WHERE pid = %s AND timestamp = %s""",(pid,timestamp))
                    progentrytest = self.db_fetchone()
                    self.db_select("""SELECT duration FROM programmes_unique WHERE pid = %s""",(pid))
                    progtest2 = self.db_fetchone()
                    if progentrytest == None:
                        self.db_insert("""INSERT INTO programmes (pid,timediff,timestamp,utcoffset,channel) VALUES (%s,%s,%s)""", (pid,offset,timestamp,utcoffset,self.channel))
                        if progtest2 == None:
                            self.db_insert("""INSERT INTO programmes_unique (pid,title,duration) VALUES (%s,%s,%s)""", (pid,title,duration))
                            for word in keywords:
                                self.db_insert("""INSERT INTO keywords (pid,keyword,type) VALUES (%s,%s,%s)""", (pid,word,keywords[word]))
                    else:
                        # Fix for programmes where the duration is changed last minute
                        if progtest2[0] < duration:
                            #self.db_update("""UPDATE programmes SET duration = %s WHERE pid = %s AND timestamp = %s""",(duration,pid,timestamp))
                            self.db_update("""UPDATE programmes_unique SET duration = %s WHERE pid = %s""",(duration,pid))
                    keywords = list()
                else:
                    keywords = None

                if keywords != None:
                    # Guard: when doStuff returned None there is nothing to look up,
                    # and appending to keywords would fail
                    self.db_select("""SELECT keyword FROM keywords WHERE pid = %s""",(pid))
                    keywordquery = self.db_fetchall()
                    for keyword in keywordquery:
                        # This ^ is a temporary fix until I work out a better DB structure
                        if "^" in keyword[0]:
                            keywords.append(string.replace(keyword[0],"^"," "))
                        else:
                            keywords.append(keyword[0])

                if (keywords != oldkeywords) and (keywords != None):
                    Print(keywords)
                    self.send([keywords,[pid]],"outbox")
                
            else:
                # Still need to fix the 'changed to - off air' problem, but it isn't causing twitter keyword redos thankfully (purely a console output error)
                # Possible issue will start to occur if programmes change too often - tweet stream will miss too much
                keywords = list()
                for channel in self.channels:
                    oldpid = self.channels[channel]
                    if oldpid == None:
                        self.db_update("""UPDATE programmes SET imported = 1 WHERE channel = %s""",(channel))
                    data = self.doStuff(channel)
                    if data != None:
                        keywordappender = data[0]
                        pid = data[1][0]
                        title = data[1][1]
                        offset = data[1][2]
                        duration = data[1][3]
                        timestamp = data[1][4]
                        utcoffset = data[1][5]
                        self.db_update("""UPDATE programmes SET imported = 1 WHERE pid != %s AND channel = %s""",(pid,channel))
                        self.db_select("""SELECT channel FROM programmes WHERE pid = %s AND timestamp = %s""",(pid,timestamp))
                        progentrytest = self.db_fetchone()
                        self.db_select("""SELECT duration FROM programmes_unique WHERE pid = %s""",(pid))
                        progtest2 = self.db_fetchone()
                        if progentrytest == None:
                            self.db_insert("""INSERT INTO programmes (pid,timediff,timestamp,utcoffset,channel) VALUES (%s,%s,%s,%s,%s)""", (pid,offset,timestamp,utcoffset,channel))
                            if progtest2 == None:
                                self.db_insert("""INSERT INTO programmes_unique (pid,title,duration) VALUES (%s,%s,%s)""", (pid,title,duration))
                                for word in keywordappender:
                                    self.db_insert("""INSERT INTO keywords (pid,keyword,type) VALUES (%s,%s,%s)""", (pid,word,keywordappender[word]))
                        else:
                            # Fix for programmes where the duration is changed last minute
                            if progtest2[0] < duration:
                                #self.db_update("""UPDATE programmes SET duration = %s WHERE pid = %s AND timestamp = %s""",(duration,pid,timestamp))
                                self.db_update("""UPDATE programmes_unique SET duration = %s WHERE pid = %s""",(duration,pid))

                currentpids = list()
                for channel in self.channels:
                    if self.channels[channel] != "" and self.channels[channel] != None:
                        currentpids.append(self.channels[channel])

                for pid in currentpids:
                    self.db_select("""SELECT keyword FROM keywords WHERE pid = %s""",(pid))
                    keywordquery = self.db_fetchall()
                    for keyword in keywordquery:
                        # This ^ is a temporary fix until I work out a better DB structure
                        if "^" in keyword[0]:
                            keywords.append(string.replace(keyword[0],"^"," "))
                        else:
                            keywords.append(keyword[0])

                # Remove repeated keywords here
                if len(keywords) != 0:
                    keywords = list(set(keywords))

                if (keywords != oldkeywords) and (len(keywords) != 0):
                    Print(keywords)
                    self.send([keywords,currentpids],"outbox") # epicfail: now need to send all pids, and search through them further down the line


            oldkeywords = keywords
            # At this point, find the version tags to allow further info finding
            # Then, pass keywords to TwitterStream. DataCollector will pick up the data
            # Must deal with errors passed back from TwitterStream here
            self.firstrun = False
            time.sleep(30) # Wait 30 secs - a longer wait isn't needed given the delay between /programmes requests
            # This could instead wait until the programme is due to change, but that *may* miss last-minute schedule changes
            
    def main(self):
        Print("Pausing", self.tag)
        self.pause(1)
        Print("Pausing", self.tag)
        self.send(producerFinished(), "signal")
Example #29
0
def ShouldWeRestartBookmarks(historical_status):
    deltas = {
        "keywords": [],
        "wordanalysis": [],
        "rawdata": [],
        "analyseddata": [],
        "timestamp": []
    }
    if len(historical_status) < 2:
        # Not enough information to compute any deltas yet
        return False

    # Growth of each counter between consecutive snapshots; a sustained run of
    # zeros for a counter means that subsystem has stalled
    for k in range(1, len(historical_status)):
        prev, curr = historical_status[k - 1], historical_status[k]
        for field in ("keywords", "wordanalysis", "rawdata", "analyseddata"):
            deltas[field].append(curr[field] - prev[field])
        deltas["timestamp"].append(int(curr["timestamp"] - prev["timestamp"]))

    # Debug aid: dump the per-period deltas
    import pprint
    pprint.pprint(deltas)

    # The last six snapshots cover roughly the last hour of schedule activity
    last_hour_schedule_activity = sum(deltas["keywords"][-6:])
    last_2periods_tweet_collation_activity = sum(deltas["rawdata"][-2:])
    last_2periods_wordanalysis_activity = sum(deltas["wordanalysis"][-2:])
    last_2periods_analyseddata_activity = sum(deltas["analyseddata"][-2:])

    # Most recent delta for every counter; if all of them are zero the whole
    # process has stalled
    all_current_activity = (deltas["keywords"][-1], deltas["rawdata"][-1],
                            deltas["wordanalysis"][-1], deltas["analyseddata"][-1])
    if all(delta == 0 for delta in all_current_activity):
        Print("Bookmarks.py is showing no activity at all, of any kind, very likely dead")
        return True

    if last_hour_schedule_activity == 0 and len(deltas["keywords"]) > 5:
        Print("Looks like schedule collation in Bookmarks.py has died, needs restart")
        return True

    if last_2periods_tweet_collation_activity == 0 and len(deltas["rawdata"]) > 1:
        Print("Looks like tweet collection in Bookmarks.py has died, needs restart")
        return True

    if last_2periods_wordanalysis_activity == 0 and len(deltas["wordanalysis"]) > 1:
        Print("Looks like word analysis of tweets in Bookmarks.py has died, needs restart")
        return True

    if last_2periods_analyseddata_activity == 0 and len(deltas["analyseddata"]) > 1:
        Print("No tweets analysed in 2 periods; in all likelihood the analysis subsystem in Bookmarks.py has died, needs restart")
        return True

    # Warnings - one quiet period is tolerated before the checks above trigger
    if deltas["rawdata"][-1] == 0:
        Print("WARNING - no tweets collected in 1 period, might be dead. Waiting 1 period")
    if deltas["wordanalysis"][-1] == 0:
        Print("WARNING - no tweet words analysed in 1 period, might be dead. Waiting 1 period")
    if deltas["analyseddata"][-1] == 0:
        Print("WARNING - no tweets analysed in 1 period, might be dead. Waiting 1 period")

    return False
Example #30
0
                    # Finished analysis - update DB
                    self.db_update(
                        """UPDATE programmes SET totaltweets = %s, meantweets = %s, mediantweets = %s, modetweets = %s, stdevtweets = %s WHERE pid = %s AND timestamp = %s""",
                        (totaltweets, meantweets, mediantweets, modetweets,
                         stdevtweets, pid, timestamp))

                else:
                    # Skip this tweet - it falls outside the programme's running time
                    pass

                # Mark the tweet as analysed
                self.db_update("""UPDATE rawdata SET analysed = 1 WHERE tid = %s""", (tid,))
                Print("Analysis component: Done!")

            # Stage 2: if all raw tweets are analysed and imported = 1 (i.e. all data
            # for this programme is stored and the programme has finished), finalise
            # the analysis - bookmark identification could be done here too
            self.db_select(
                """SELECT pid,totaltweets,meantweets,mediantweets,modetweets,stdevtweets,timestamp,timediff FROM programmes WHERE imported = 1 AND analysed = 0 LIMIT 5000"""
            )
            data = self.db_fetchall()
            # Cycle through each programme that's ready for final analysis
            for result in data:
                pid = result[0]
                self.db_select("""SELECT duration,title FROM programmes_unique WHERE pid = %s""", (pid,))
                data2 = self.db_fetchone()
                if not data2:
                    Print("Getting data for duration,title, etc failed - pid",