def __init__(self, hashtags, session, engine):
    """
    Set up the listener: initialise the StreamListener base, keep the
    database handles and store the tracked hashtags in unicode form.

    hashtags -- iterable of hashtags, passed through format_hashtags()
    session  -- bridge to the db
    engine   -- stored as-is on the instance
    """
    StreamListener.__init__(self)

    # Database handles are independent from everything else below.
    self.session = session
    self.engine = engine

    # FIXME: test if useful
    self.cpt = 0

    # The encoding helper is created before format_hashtags() is called,
    # as in the original ordering.
    self.eu = EncodingUtils()
    self.hashtags = self.format_hashtags(hashtags)
def __init__(self, author, created, inserted, source, text):
    """
    Build a tweet from raw values.

    author, created, source and text are converted with
    EncodingUtils.to_unicode(); inserted is stored untouched.
    A new tweet starts not crawled, not invalid and with an empty
    main hashtag.
    """
    # used to switch to unicode
    self.eu = EncodingUtils()
    convert = self.eu.to_unicode

    # Textual fields, all converted through the same helper.
    self.author = convert(author)
    self.created = convert(created)
    self.source = convert(source)
    self.hashtag = convert('')
    self.text = convert(text)

    # Fields stored without conversion.
    self.inserted = inserted
    self.crawled = False
    self.invalid = False  # cannot be invalid by default

    # Computed last: extract_hashtags() is called once self.text is set.
    self.hashtags = self.extract_hashtags()
#! /usr/bin/env python
# ! coding=utf-8
# ! author scq000
"""
Runs the RailGun task described in sites.json and dumps every resulting
shell of the 'default' group into result.txt.
"""
from pyrailgun import RailGun
import json
import sys
from encodingUtils import EncodingUtils

# Python 2 only: make utf8 the implicit str <-> unicode codec.
reload(sys)
sys.setdefaultencoding('utf8')

# Fields written one per line (terminated by "\r\n") for each node.
# 'updateTime' is written separately because it carries the separator.
LINE_FIELDS = ('name', 'src', 'magnet', 'thunder', 'size')

encodingUtils = EncodingUtils()
railgun = RailGun(encodingUtils)

# The task file stays open until fire() has returned, in case RailGun
# reads it lazily; the `with` block then guarantees it is closed.
with open("sites.json") as task_file:
    railgun.setTask(task_file)
    railgun.fire()

nodes = railgun.getShells('default')

# `with` closes (and therefore flushes) the result file even if a node
# raises while being written; the builtin `file` is no longer shadowed.
with open("result.txt", "w+") as result_file:
    for item in nodes:
        node = nodes[item]
        # Every value is looked up as a list; only its first element is
        # written, with [""] as the fallback when the key is absent.
        for field in LINE_FIELDS:
            result_file.write(node.get(field, [""])[0] + "\r\n")
        result_file.write(
            node.get('updateTime', [""])[0] + "\r\n====================================\n")
class StreamSaverListener(StreamListener):
    """
    Stream that will save each tweet it receives into a database
    to be reused later
    """

    def __init__(self, hashtags, session, engine):
        """
        hashtags -- tracked hashtags, stored after format_hashtags()
        session  -- bridge to the db
        engine   -- stored as-is on the instance
        """
        StreamListener.__init__(self)
        # Number of member changes added to the session since the last
        # commit. FIXME: test if useful
        self.cpt = 0
        self.eu = EncodingUtils()
        self.hashtags = self.format_hashtags(hashtags)
        self.session = session  # bridge to the db
        self.engine = engine

    def on_status(self, status):
        """
        Each time a tweet is received: store it, update the matching
        member and commit when a member change is pending.
        """
        tweet = Tweet(status.author.screen_name,
                      status.created_at,
                      datetime.datetime.now(),
                      status.source,
                      status.text)
        # FIXME: should be part of the init, shouldn t it ?
        tweet.get_main_tag(self.hashtags)

        # adds current tweet to the tweet table for logging.
        self.session.add(tweet)

        self.update_members(tweet)

        # cpt is only incremented by create_member()/update_member(), so
        # a commit happens only when a member row was actually touched.
        if self.cpt >= 1:
            self.session.commit()  # force saving changes
            self.cpt = 0

    def on_error(self, status_code):
        """ Reports the error code and keeps the stream alive """
        print('An error has occured! Status code = %s' % status_code)
        return True  # keeps stream alive

    def on_timeout(self):
        """ Reports the timeout """
        print('Snoozing Zzzzzz')

    def on_delete(self, status_id=None, user_id=None):
        """
        Called when a deletion notice arrives.

        The arguments are accepted (and ignored) so that the call made
        by the streaming library with a status id and a user id no
        longer fails with a TypeError; calling without arguments still
        works.
        """
        # NOTE(review): returning False presumably asks the stream to
        # stop -- confirm this is the intended reaction to a deletion.
        return False

    def format_hashtags(self, hashs):
        """ Returns the same list of hashtags in unicode format """
        return [self.eu.to_unicode(has) for has in hashs]

    def update_members(self, tweet):
        """
        Updates the member table using the last tweet received.
        If Member already exists and has already used the hashtag,
        it is updated. If member doesnt exist yet for the hashtag,
        it will be created. Duplicates are reported and left untouched.
        """
        # Run the query once and reuse the rows for both the count and
        # the member to update.
        members = (self.session.query(Member)
                   .filter(Member.author == tweet.author)
                   .filter(Member.hashtag == tweet.hashtag)
                   .all())

        if len(members) > 1:
            print("Error: Duplicate members found.")
        elif not members:
            print("No member found, creating")
            self.create_member(tweet)
        else:
            # exactly one match
            print("Member found, updating")
            self.update_member(members[0])

    def create_member(self, tweet):
        """
        Creates a new Member using data from the given Tweet.
        Called when no Member is found for the current author/hashtag
        couple. An invalid tweet is reported and ignored.
        """
        if tweet.has_author() and tweet.has_hashtag():
            member = Member(tweet.author, tweet.hashtag, 1)
            self.session.add(member)
            self.cpt += 1
        else:
            # FIXME : Take care -- no exception is raised here, unlike
            # update_member().
            print("ElementException : Cannot create Member, Tweet is not valid !")

    def update_member(self, member):
        """
        Updates member values through member.update() and schedules the
        member for saving.

        Raises ElementException when the member has no author or no
        hashtag.
        """
        if member.has_author() and member.has_hashtag():
            member.update()
            self.session.add(member)
            self.cpt += 1
        else:
            print("ElementException : Cannot update Member, Member is not valid !")
            raise ElementException  # FIXME : Take care
class Tweet(Base):
    """
    Class that fully represents a tweet as it is stored in the database.
    It is different from the structure that can be found in tweepy
    """
    __tablename__ = "tweet"

    id = Column(Integer, primary_key=True)
    hashtag = Column(String(200))  # Hashtag that is tracked
    text = Column(String(200))  # Content of the tweet
    author = Column(String(200))  # name of the tweeter
    # FIXME: Change to date. Date at which message was tweeted
    created = Column(String(200))
    inserted = Column(DateTime)  # Date at which tweet was saved in db
    crawled = Column(Boolean)  # Whether or not tweet is in statistics already
    source = Column(String(200))  # Where tweet comes from
    # Boolean that is set to True if Tweet cannot be processed correctly
    invalid = Column(Boolean)

    def __init__(self, author, created, inserted, source, text):
        """
        author, created, source and text are converted to unicode;
        inserted is stored untouched. The main hashtag starts empty and
        is filled in later by get_main_tag().
        """
        # NOTE(review): eu and hashtags are plain attributes set only
        # here; presumably they are missing on instances loaded from the
        # database -- confirm before calling get_main_tag() on those.
        self.eu = EncodingUtils()  # used to switch to unicode
        self.author = self.eu.to_unicode(author)
        self.created = self.eu.to_unicode(created)
        self.crawled = False
        self.inserted = inserted
        self.source = self.eu.to_unicode(source)
        self.hashtag = self.eu.to_unicode('')
        self.text = self.eu.to_unicode(text)
        self.hashtags = self.extract_hashtags()
        self.invalid = False  # cannot be invalid by default

    def extract_hashtags(self):
        """
        Extracts all the hashtags that are present in the tweet, as a
        set of whitespace-separated words starting with '#' (the '#' is
        kept).
        FIXME: Problem here is that we lose lots of tags because they
        end/start with special characters!
        """
        return set(part for part in self.text.split() if part.startswith('#'))

    def get_main_tag(self, trendy):
        """
        Given a list of tracked hashtag, defines the most important one.
        The comparison is case-insensitive; self.hashtag is left
        unchanged when no hashtag of the tweet is tracked.
        """
        tracked = set(tag.lower() for tag in trendy)
        match = [tag for tag in (i.lower() for i in self.hashtags)
                 if tag in tracked]
        if match:
            # self.hashtags is a set, so when several tags match the
            # one picked depends on the set iteration order.
            self.hashtag = self.eu.to_unicode(match[0])

    def has_author(self):
        """ Returns True if author is not empty or null """
        # None must be ruled out before len(), otherwise a null author
        # raises TypeError instead of returning False.
        return self.author is not None and len(self.author) != 0

    def has_hashtag(self):
        """ Returns True if hashtag is not empty or null """
        # Same ordering as has_author(): check None first.
        return self.hashtag is not None and len(self.hashtag) != 0

    def __repr__(self):
        try:
            return "<%s('%s','%s', '%s')>" % (self.author.encode('utf-8'),
                                              self.created.encode('utf-8'),
                                              self.hashtag.encode('utf-8'),
                                              self.text.encode('utf-8'))
        except UnicodeDecodeError:
            return "Contains Unicode!!"