def __init__(self):
    """Build a synthetic, time-ordered tweet stream from two simulated sequences.

    One sequence comes from ``poisson.simulate(period=600)`` and is paired
    with documents drawn from ``self.g_topics``; the other comes from
    ``hpk.simulate(180)``, is shifted by 420, and is paired with documents
    drawn from ``self.b_topics``. Both are merged into ``self.stream_all``,
    ordered by each item's ``datetime()``, and ``self.index`` is reset to 0.
    """
    # Only the sequences are used; the second value returned by each
    # simulator is discarded.
    poisson_offsets, _poisson_rate = poisson.simulate(period=600)
    poisson_offsets = np.array(poisson_offsets)
    hpk_offsets, _hpk_rate = hpk.simulate(180)
    hpk_offsets = np.array(hpk_offsets) + 420

    self.create_parameters()

    # Base timestamp; every offset is scaled by 60 and added to it
    # (presumably minutes -> seconds on a Unix epoch -- confirm).
    base_time = 1425847547.

    g_items = [
        stream.RawTweetItem(base_time + 60 * offset, 0, self.get_doc(self.g_topics))
        for offset in poisson_offsets
    ]
    b_items = [
        stream.RawTweetItem(base_time + 60 * offset, 0, self.get_doc(self.b_topics))
        for offset in hpk_offsets
    ]

    self.stream_all = sorted(g_items + b_items, key=lambda entry: entry.datetime())
    self.index = 0
def next(self):
    """Return the next tweet parsed from ``self.f``, or ``stream.End_Of_Stream``.

    Lines are pulled from ``self.f`` one at a time. Three prefixes are
    recognised: ``content: `` (tweet text, kept verbatim including any
    trailing newline), ``userId: `` (user id) and ``publishedTimeGmt: ``
    (timestamp, divided by 1000 and then shifted forward by eight hours).
    A line starting with ``-------------`` closes the record and a
    ``stream.RawTweetItem`` is built from whatever fields were seen since
    this call started.

    Returns:
        A ``stream.RawTweetItem`` for the next complete record, or
        ``stream.End_Of_Stream`` when the input is exhausted, when a
        timestamp greater than ``self.end`` is read, or when a line cannot
        be read or parsed.
    """
    content_prefix = 'content: '
    user_prefix = 'userId: '
    time_prefix = 'publishedTimeGmt: '

    _t = None
    _user = None
    _tweet = None
    while True:
        try:
            # The next() builtin works on both Python 2 and Python 3 iterators.
            line = next(self.f)
            # Checked before any str method is called on the line.
            if line is None:
                return stream.End_Of_Stream
            if line.startswith(content_prefix):
                # Slice the prefix off instead of splitting on it, so a tweet
                # whose own text contains 'content: ' is not truncated.
                _tweet = line[len(content_prefix):]
            if line.startswith(user_prefix):
                # SECURITY: eval() executes whatever the input contains. Only
                # safe while the file is trusted; consider int() or
                # ast.literal_eval() once the field format is confirmed.
                _user = eval(line[len(user_prefix):])
            if line.startswith(time_prefix):
                # SECURITY: same eval() caveat as above.
                # Presumably milliseconds -> seconds -- confirm.
                _t = eval(line[len(time_prefix):]) / 1000
                _t += 8 * 60 * 60  # 8 hours
                if _t > self.end:
                    return stream.End_Of_Stream
            if line.startswith('-------------'):
                return stream.RawTweetItem(_t, _user, _tweet)
        except StopIteration:
            # Normal end of input.
            return stream.End_Of_Stream
        except Exception:
            # Best effort, as before: any read or parse failure ends the
            # stream. Unlike a bare ``except:``, KeyboardInterrupt and
            # SystemExit are allowed to propagate.
            return stream.End_Of_Stream
def source2tweet(source):
    """Wrap a raw source record in a ``stream.RawTweetItem``.

    Args:
        source: Mapping providing ``"created_at"`` (a string in
            ``'%Y-%m-%d %H:%M'`` format), ``"uid"`` and ``"text"``.

    Returns:
        The new ``stream.RawTweetItem``, with ``source`` attached to it.
    """
    created = datetime.strptime(source["created_at"], '%Y-%m-%d %H:%M')
    tweet_item = stream.RawTweetItem(created, source["uid"], source["text"])
    # Keep the full original record reachable from the item.
    tweet_item.attach(source)
    return tweet_item
def next(self):
    """Return the next row of ``weibo_timelines`` as a ``stream.RawTweetItem``.

    Rows are fetched from ``self.cursor``. When the current result set is
    exhausted, the cursor and connection are closed, ``self.dy_start`` is
    advanced by one day, and a new query covering that single day is run.
    Days that contain no rows are skipped.

    Returns:
        A ``stream.RawTweetItem`` with the row (as a dict keyed by column
        name) attached, or ``stream.End_Of_Stream`` once ``self.dy_start``
        is no longer before ``self.dy_end``.
    """
    row = self.cursor.fetchone()
    # Loop rather than test once: a day with no rows must roll over to the
    # following day instead of falling through with ``row`` still None and
    # failing on the first ``row[...]`` lookup.
    while row is None:
        self.cursor.close()
        self.connection.close()
        self.dy_start = self.dy_start + td(days=1)
        if self.dy_start >= self.dy_end:
            return stream.End_Of_Stream
        self.connection = MySQLdb.connect(host='?', user='******', db='?', charset='utf8')
        self.cursor = self.connection.cursor()
        _time0 = self.dy_start.strftime("%Y-%m-%d")
        _time1 = (self.dy_start + td(days=1)).strftime("%Y-%m-%d")
        # Both bounds come from strftime on our own dates, not from user input.
        sql_str = 'select * from ' + 'weibo_timelines' + ' where created_at >= "%s" and created_at < "%s" order by created_at' % (
            _time0, _time1)
        print(sql_str)
        self.cursor.execute(sql_str)
        row = self.cursor.fetchone()

    # Column names copied into the attached record; self.id_map gives the
    # position of each one inside the fetched row.
    columns = (
        "mid", "uid", "retweet_num", "comment_num", "favourite_num",
        "created_at", "from", "text", "entity", "source_mid", "source_uid",
        "mentions", "check_in", "check_in_url", "is_deleted", "timestamp",
    )
    _obj = {name: row[self.id_map[name]] for name in columns}

    _t = datetime.strptime(_obj["created_at"], '%Y-%m-%d %H:%M')
    item = stream.RawTweetItem(_t, _obj["uid"], _obj["text"])
    item.attach(_obj)
    return item
def load(self, start, end):
    """Load the rows between ``start`` and ``end`` into ``self.deq``.

    ``self.range(start, end)`` is retried every 300 seconds for as long as
    it returns None. Each returned row is converted to a
    ``stream.RawTweetItem`` (with the row, as a dict keyed by column name,
    attached) and appended to ``self.deq``. Afterwards the number of queued
    items per minute of ``self.delta`` is reported to ``sys_monitor`` and
    ``self.start`` is moved to ``end``.

    Args:
        start: Lower bound passed through to ``self.range``.
        end: Upper bound passed through to ``self.range``; also reported
            alongside the rate and stored as the new ``self.start``.
    """
    res = self.range(start, end)
    while res is None:
        print('Search range failure. Sleeping 300 seconds.')
        time.sleep(300)
        res = self.range(start, end)

    # Column names copied into the attached record; self.id_map gives the
    # position of each one inside a row.
    columns = (
        "mid", "uid", "retweet_num", "comment_num", "favourite_num",
        "created_at", "from", "text", "entity", "source_mid", "source_uid",
        "mentions", "check_in", "check_in_url", "is_deleted", "timestamp",
    )
    for row in res:
        _obj = {name: row[self.id_map[name]] for name in columns}
        _t = datetime.strptime(_obj["created_at"], '%Y-%m-%d %H:%M')
        item = stream.RawTweetItem(_t, _obj["uid"], _obj["text"])
        item.attach(_obj)
        self.deq.append(item)

    # Joined by hand so the output is the same whether ``print`` is a
    # statement or a function.
    print(' '.join(str(part) for part in ('LOADING FINISHED.', len(self.deq), start, end)))

    # total_seconds() covers the whole span. ``.seconds`` is only the
    # sub-day remainder, so a window of exactly N days would give 0 here
    # and divide by zero below.
    # Assumes self.delta is a datetime.timedelta -- TODO confirm.
    dt = self.delta.total_seconds() / 60.0
    sys_monitor.report_rate(len(self.deq) / dt, end)
    self.start = end
def __init__(self):
    """Load every record of ``./tweets.txt`` into ``self.deq``.

    Three line prefixes are recognised: ``content: `` (tweet text, kept
    verbatim including any trailing newline), ``userId: `` (user id) and
    ``publishedTimeGmt: `` (timestamp, divided by 1000 and then shifted
    forward by eight hours). Each line starting with ``-------------``
    appends a ``stream.RawTweetItem`` built from the most recently seen
    fields.

    Fields are not reset between records, so a record that omits a field
    reuses the value from the previous record.
    """
    content_prefix = 'content: '
    user_prefix = 'userId: '
    time_prefix = 'publishedTimeGmt: '

    self.deq = deque([])
    _t = None
    _user = None
    _tweet = None
    # ``with`` guarantees the file is closed, including when parsing raises.
    with open('./tweets.txt', 'r') as f:
        for line in f:
            if line.startswith(content_prefix):
                # Slice the prefix off instead of splitting on it, so a tweet
                # whose own text contains 'content: ' is not truncated.
                _tweet = line[len(content_prefix):]
            if line.startswith(user_prefix):
                # SECURITY: eval() executes whatever the file contains. Only
                # safe while the file is trusted; consider int() or
                # ast.literal_eval() once the field format is confirmed.
                _user = eval(line[len(user_prefix):])
            if line.startswith(time_prefix):
                # SECURITY: same eval() caveat as above.
                # Presumably milliseconds -> seconds -- confirm.
                _t = eval(line[len(time_prefix):]) / 1000
                _t += 8 * 60 * 60  # 8 hours
            if line.startswith('-------------'):
                self.deq.append(stream.RawTweetItem(_t, _user, _tweet))
    print('LOADING FINISHED.')