def is_relevant(user_id): user_id = str(user_id) if user_id in RELEVANT: return RELEVANT[user_id] else: retries = 0 while True: try: TW = API_HANDLER.get_connection() u = TW.get_user(user_id) relevant = u.followers_count > 40 and u.friends_count > 40 RELEVANT[user_id] = relevant with open(RELEVANT_FNAME, 'w') as f: json.dump(RELEVANT, f) return relevant except Exception, e: print "Error in is_relevant for %s" % user_id retries += 1 if retries == 5: print "Gave up retrying for user %s" % user_id print "(marked as not relevant)" return False else: print "waiting..." time.sleep(10)
def get_followed_user_ids(user=None, user_id=None): if user is not None: user_id = user.id if GRAPH.out_degree(user_id): followed = GRAPH.successors(user_id) return followed else: retries = 0 while True: try: TW = API_HANDLER.get_connection() followed = TW.friends_ids(user_id=user_id) GRAPH.add_edges_from([(user_id, f_id) for f_id in followed]) return followed except Exception, e: # print e if e.message == u'Not authorized.': NOTAUTHORIZED.add(user_id) with open(NOTAUTHORIZED_FNAME, 'wb') as f: pickle.dump(NOTAUTHORIZED, f) return [] else: print "Error for user %d: %s" % (user_id, e.message) retries += 1 if retries == 5: print "Gave up retrying for user %d" % user_id return [] else: print "waiting..." time.sleep(10)
def fetch_timeline(self, session):
    """Download this user's timeline tweets with created_at in
    (DATE_LOWER_LIMIT, DATE_UPPER_LIMIT] into self.timeline, persisting
    new Tweet (and Retweet) rows through *session*.

    Pages through the API until an empty page or a tweet older than
    DATE_LOWER_LIMIT is seen (timeline pages are assumed
    newest-first — TODO confirm). Sets self.is_authorized = False and
    stops on a 'Not authorized.' error; other errors retry the same
    page after 10s.
    """
    print "Fetching timeline for user %d" % self.id
    start_time = time.time()
    # authenticating here ensures a different set of credentials
    # everytime we start processing a new county, to prevent hitting the rate limit
    self.timeline = []
    page = 1
    done = False
    while not done:
        TW_API = API_HANDLER.get_fresh_connection()
        try:
            tweets = TW_API.user_timeline(user_id=self.id, page=page)
        except Exception, e:
            if e.message == u'Not authorized.':
                self.is_authorized = False
                break
            else:
                print("Error: %s" % e.message)
                print "waiting..."
                time.sleep(10)
                # retry the same page with a fresh connection
                continue
        if not tweets:
            # All done
            break
        else:
            for t in tweets:
                if t.created_at > DATE_UPPER_LIMIT:
                    # too new: skip, keep scanning older tweets on this page
                    continue
                elif t.created_at > DATE_LOWER_LIMIT:
                    isretweet = False
                    if hasattr(t, 'retweeted_status'):
                        # store the ORIGINAL tweet; remember when it was retweeted
                        retweeted_at = t.created_at
                        t = t.retweeted_status
                        isretweet = True
                    tid = t.id
                    tweet = session.query(Tweet).get(tid)
                    if not tweet:
                        tweet = Tweet(**{
                            f: t.__getattribute__(f) for f in TWEET_FIELDS
                        })
                        tweet.author_id = t.author.id
                        session.add(tweet)
                    if isretweet:
                        retweet = Retweet(userid=self.id, tweetid=t.id, retweeted_at=retweeted_at)
                        session.add(retweet)
                    self.timeline.append(tweet)
                else:
                    # older than the window: everything after is older too
                    done = True
                    break
        page += 1  # next page
def fetch_timeline(self, session): print "Fetching timeline for user %d" % self.id start_time = time.time() # authenticating here ensures a different set of credentials # everytime we start processing a new county, to prevent hitting the rate limit self.timeline = [] page = 1 done = False while not done: TW_API = API_HANDLER.get_fresh_connection() try: tweets = TW_API.user_timeline(user_id=self.id, page=page) except Exception, e: if e.message == u'Not authorized.': self.is_authorized = False break else: print("Error: %s" % e.message) print "waiting..." time.sleep(10) continue if not tweets: # All done break else: for t in tweets: if t.created_at > DATE_UPPER_LIMIT: continue elif t.created_at > DATE_LOWER_LIMIT: isretweet = False if hasattr(t, 'retweeted_status'): retweeted_at = t.created_at t = t.retweeted_status isretweet = True tid = t.id tweet = session.query(Tweet).get(tid) if not tweet: tweet = Tweet(**{f: t.__getattribute__(f) for f in TWEET_FIELDS}) tweet.author_id = t.author.id session.add(tweet) if isretweet: retweet = Retweet(userid=self.id, tweetid=t.id, retweeted_at=retweeted_at) session.add(retweet) self.timeline.append(tweet) else: done = True break page += 1 # next page
def agregar_conexiones(g): """ Dado un grafo incompleto de usuarios agrega relaciones de seguir entre sus nodos """ g = g.copy() uids_g = list(g.nodes()) for uid in uids_g: print uid seguidos = TW.traer_seguidos(user_id=uid) g.add_edges_from([(uid, sid) for sid in seguidos if sid in uids_g]) return g
def fetch_retweets(user_id):
    """Fetch retweets by *user_id* newer than RT_DATE_LIMIT and dump
    them to retweets/<user_id>.json (skipped if that file exists).

    NOTE(review): progress is journaled to a '.tmp' file after every
    appended retweet; if the run ends with no retweets collected, a
    stale '.tmp' file may be left behind — confirm intended.
    """
    # NOT WORKING, need to fetch entire timeline
    retweets_file = "retweets/%s.json" % user_id
    print "Fetching retweets for user %d" % user_id
    start_time = time.time()
    if not os.path.exists(retweets_file):
        # authenticating here ensures a different set of credentials
        # everytime we start processing a new county, to prevent hitting the rate limit
        retweets = []
        page = 1
        done = False
        while not done:
            TW_API = API_HANDLER.get_fresh_connection()
            try:
                rts = TW_API.retweets(user_id=user_id, page=page)
            except Exception, e:
                if e.message == u'Not authorized.':
                    NOTAUTHORIZED.add(user_id)
                    with open(NOTAUTHORIZED_FNAME, 'wb') as f:
                        pickle.dump(NOTAUTHORIZED, f)
                    break
                else:
                    print("Error: %s" % e.message)
                    print "waiting..."
                    time.sleep(10)
                    # retry the same page with a fresh connection
                    continue
            if rts:
                for t in rts:
                    if t.created_at > RT_DATE_LIMIT:
                        retweets.append({
                            "timestamp": t.created_at.strftime("%Y/%m/%d %H:%M:%S"),
                            "text": t.text,
                            "user_id": t.user.id,
                            "id": t.id
                        })
                        # journal progress so a crash loses nothing
                        json_dump_unicode(retweets, retweets_file + ".tmp")
                    else:
                        # older than the cutoff: stop paging entirely
                        done = True
                        break
            else:
                # All done
                break
            page += 1  # next page
        if retweets:
            os.remove(retweets_file + ".tmp")
            json_dump_unicode(retweets, retweets_file)
def fetch_tweets(db, hashtags, posicion, desde, hasta): coll = db.abortolegal query = ' OR '.join(hashtags[posicion]) count = 0 for status in tweepy.Cursor(TW.conn_.search, q=query, tweet_mode='extended', count=100, result_type="recent", since=desde, until=hasta, lang="es").items(): data = status._json data['posicion'] = posicion if not coll.find({'id': data['id']}).count(): coll.insert(data) count += 1 if count % 500 == 0: print count if count % 10000 == 0: TW.get_fresh_connection()
def fetch_favorites(self, session):
    """Download this user's favorited tweets with created_at in
    (DATE_LOWER_LIMIT, DATE_UPPER_LIMIT] into self.favs, persisting new
    Tweet rows through *session*.

    Pages until an empty page or a favorite older than DATE_LOWER_LIMIT
    (pages assumed newest-first — TODO confirm). Sets
    self.is_authorized = False and stops on 'Not authorized.'; other
    errors retry the same page after 10s.
    """
    print "Fetching favorites for user %d" % self.id
    start_time = time.time()
    self.favs = []
    page = 1
    done = False
    while not done:
        TW_API = API_HANDLER.get_fresh_connection()
        try:
            tweets = TW_API.favorites(user_id=self.id, page=page)
        except Exception, e:
            if e.message == u'Not authorized.':
                self.is_authorized = False
                break
            else:
                print("Error: %s" % e.message)
                print "waiting..."
                time.sleep(10)
                # retry the same page with a fresh connection
                continue
        if not tweets:
            # All done
            break
        else:
            for t in tweets:
                if t.created_at > DATE_UPPER_LIMIT:
                    # too new: skip and keep scanning this page
                    continue
                elif t.created_at > DATE_LOWER_LIMIT:
                    tid = t.id
                    tweet = session.query(Tweet).get(tid)
                    if not tweet:
                        tweet = Tweet(**{
                            f: t.__getattribute__(f) for f in TWEET_FIELDS
                        })
                        tweet.author_id = t.author.id
                        session.add(tweet)
                    self.favs.append(tweet)
                else:
                    # older than the window: stop paging
                    done = True
                    break
        page += 1  # next page
def fetch_favorites(self, session): print "Fetching favorites for user %d" % self.id start_time = time.time() self.favs = [] page = 1 done = False while not done: TW_API = API_HANDLER.get_fresh_connection() try: tweets = TW_API.favorites(user_id=self.id, page=page) except Exception, e: if e.message == u'Not authorized.': self.is_authorized = False break else: print("Error: %s" % e.message) print "waiting..." time.sleep(10) continue if not tweets: # All done break else: for t in tweets: if t.created_at > DATE_UPPER_LIMIT: continue elif t.created_at > DATE_LOWER_LIMIT: tid = t.id tweet = session.query(Tweet).get(tid) if not tweet: tweet = Tweet(**{f: t.__getattribute__(f) for f in TWEET_FIELDS}) tweet.author_id = t.author.id session.add(tweet) self.favs.append(tweet) else: done = True break page += 1 # next page
def get_follower_counts(user_id):
    """Return the current follower count of *user_id* via the API."""
    conn = API_HANDLER.get_connection()
    return conn.get_user(user_id).followers_count
# initialize_db()
import networkx as nx

# Load the pickled subgraph and find user ids present in the graph but
# missing from the database, then fetch those users and insert them.
graph = nx.read_gpickle('subgraph.gpickle')
session = open_session()
gids = [int(x) for x in graph.nodes()]
dbids = [x for x, in session.query(User.id).all()]
# Membership against a set is O(1); the original `x not in dbids` list
# scan made this O(len(gids) * len(dbids)).
dbid_set = set(dbids)
missing = [x for x in gids if x not in dbid_set]
user_ids = missing
users = []
TW = API_HANDLER.get_fresh_connection()
for i, uid in enumerate(user_ids):
    try:
        u = TW.get_user(uid)
        user = User(id=int(uid), username=u.name)
        users.append(user)
    except Exception:
        # Best-effort: skip users that can't be fetched
        # (e.g. deleted/suspended accounts).
        pass
    finally:
        # Rotate credentials every 20 lookups to dodge rate limits.
        if (i + 1) % 20 == 0:
            TW = API_HANDLER.get_fresh_connection()
session.add_all(users)
session.commit()
session.close()
# Hashtag sets for each side of the abortion-law debate:
# "si" = pro-legalization, "no" = anti-legalization.
hashtags = {
    "si": [
        "#abortolegal",
        "#abortolegalya",
        "#abortolegalesvida",
        "#AbortoLegalEsSalud",
        "#novotencontralasmujeres",
        "#quesealey",
        "#queelabortosealey",
        "#AbortoSeraLey"  # since 13/6 (June 13)
    ],
    "no": [
        "#elijamoslas2vidas",
        "#noalaborto",
        "#noalabortolegal",
        "#salvemoslasdosvidas",
        "#SalvemosLas2Vidas",
        "#ArgentinaEsProvida",
        "#CuidemoslasDosVidas",
        "#AbortoLegalEsMuerte",
        "#NoAlAbortoEnArgentina"  # since 13/6 (June 13)
    ]
}

# Fetch both positions, refreshing API credentials before each run.
TW.get_fresh_connection()
fetch_tweets(db, hashtags, "si", desde, hasta)
TW.get_fresh_connection()
fetch_tweets(db, hashtags, "no", desde, hasta)