Example #1
0
def is_relevant(user_id):
    """Tell whether a Twitter user is considered relevant.

    A user is relevant when both ``followers_count`` and ``friends_count``
    are greater than 40. Answers are cached in the module-level
    ``RELEVANT`` dict under the stringified id, and the whole dict is
    rewritten to ``RELEVANT_FNAME`` as JSON after every fresh lookup.

    Any exception raised during a lookup counts as a failed attempt. The
    function waits 10 seconds between attempts and, after the fifth
    failure, returns ``False`` without caching anything.
    """
    key = str(user_id)
    if key in RELEVANT:
        return RELEVANT[key]

    failures = 0
    while failures < 5:
        try:
            conn = API_HANDLER.get_connection()
            profile = conn.get_user(key)
            verdict = (profile.followers_count > 40
                       and profile.friends_count > 40)
            RELEVANT[key] = verdict
            with open(RELEVANT_FNAME, 'w') as out:
                json.dump(RELEVANT, out)
            return verdict
        except Exception:
            print("Error in is_relevant for %s" % key)
            failures += 1
            if failures < 5:
                # Still have attempts left: pause before trying again.
                print("waiting...")
                time.sleep(10)

    print("Gave up retrying for user %s" % key)
    print("(marked as not relevant)")
    return False
Example #2
0
def get_followed_user_ids(user=None, user_id=None):
    """Return the ids followed by a user, using ``GRAPH`` as a cache.

    ``user`` (an object with an ``id`` attribute) takes precedence over
    ``user_id`` when both are given.

    If the user already has outgoing edges in ``GRAPH`` its successors are
    returned directly. Otherwise the ids are fetched with ``friends_ids``,
    added to ``GRAPH`` as edges and returned.

    When the API error message is ``'Not authorized.'`` the user id is
    added to ``NOTAUTHORIZED`` (re-pickled to ``NOTAUTHORIZED_FNAME``) and
    an empty list is returned. Any other error is retried after a
    10 second pause; after the fifth such error an empty list is returned.
    """
    if user is not None:
        user_id = user.id

    if GRAPH.out_degree(user_id):
        return GRAPH.successors(user_id)

    failures = 0
    while True:
        try:
            conn = API_HANDLER.get_connection()
            followed = conn.friends_ids(user_id=user_id)
            GRAPH.add_edges_from(
                [(user_id, followed_id) for followed_id in followed]
            )
            return followed
        except Exception as e:
            if e.message == u'Not authorized.':
                NOTAUTHORIZED.add(user_id)
                with open(NOTAUTHORIZED_FNAME, 'wb') as out:
                    pickle.dump(NOTAUTHORIZED, out)
                return []

            # NOTE: the %d conversions below require a numeric user_id.
            print("Error for user %d: %s" % (user_id, e.message))
            failures += 1
            if failures == 5:
                print("Gave up retrying for user %d" % user_id)
                return []

            print("waiting...")
            time.sleep(10)
Example #3
0
    def fetch_timeline(self, session):
        """Collect this user's tweets that fall inside the date window.

        Pages of ``user_timeline`` are requested one by one. Tweets newer
        than ``DATE_UPPER_LIMIT`` are skipped; the first tweet that is not
        newer than ``DATE_LOWER_LIMIT`` ends the whole fetch. For a retweet
        the retweeted status is what gets stored, and a ``Retweet`` row
        (user id, tweet id, retweet date) is added to ``session``.
        ``Tweet`` objects not found through ``session`` are created and
        added to it. Every kept tweet is appended to ``self.timeline``.

        An API error whose message is ``'Not authorized.'`` sets
        ``self.is_authorized`` to ``False`` and stops; any other error
        waits 10 seconds and retries the same page, with no retry limit.
        """
        print("Fetching timeline for user %d" % self.id)
        start_time = time.time()
        self.timeline = []

        page = 1
        reached_lower_limit = False
        while not reached_lower_limit:
            # Original author's note: taking a fresh connection is meant to
            # rotate credentials and avoid hitting the rate limit.
            conn = API_HANDLER.get_fresh_connection()
            try:
                statuses = conn.user_timeline(user_id=self.id, page=page)
            except Exception as e:
                if e.message == u'Not authorized.':
                    self.is_authorized = False
                    break
                print("Error: %s" % e.message)
                print("waiting...")
                time.sleep(10)
                continue  # same page again

            if not statuses:
                # All done
                break

            for status in statuses:
                if status.created_at > DATE_UPPER_LIMIT:
                    continue
                if not status.created_at > DATE_LOWER_LIMIT:
                    reached_lower_limit = True
                    break

                is_retweet = hasattr(status, 'retweeted_status')
                if is_retweet:
                    retweeted_at = status.created_at
                    # From here on work with the original tweet.
                    status = status.retweeted_status

                tweet = session.query(Tweet).get(status.id)
                if not tweet:
                    fields = dict(
                        (f, status.__getattribute__(f)) for f in TWEET_FIELDS
                    )
                    tweet = Tweet(**fields)
                    tweet.author_id = status.author.id
                    session.add(tweet)

                if is_retweet:
                    session.add(Retweet(userid=self.id,
                                        tweetid=status.id,
                                        retweeted_at=retweeted_at))

                self.timeline.append(tweet)

            page += 1  # next page
    def fetch_timeline(self, session):
        """Fill ``self.timeline`` with the user's tweets in the date window.

        Walks ``user_timeline`` page by page. A tweet newer than
        ``DATE_UPPER_LIMIT`` is ignored; a tweet newer than
        ``DATE_LOWER_LIMIT`` is kept; anything else ends the fetch.
        Retweets are stored as the retweeted status plus a ``Retweet`` row
        added to ``session``. Tweets not found through ``session`` are
        created from ``TWEET_FIELDS`` and added to it.

        A ``'Not authorized.'`` error marks ``self.is_authorized`` as
        ``False`` and stops. Other errors sleep 10 seconds and retry the
        same page indefinitely.
        """
        print("Fetching timeline for user %d" % self.id)
        start_time = time.time()
        self.timeline = []

        page = 1
        while True:
            # Original author's note: a fresh connection is requested to
            # get different credentials and stay under the rate limit.
            api = API_HANDLER.get_fresh_connection()
            try:
                batch = api.user_timeline(user_id=self.id, page=page)
            except Exception as err:
                if err.message == u'Not authorized.':
                    self.is_authorized = False
                    break
                print("Error: %s" % err.message)
                print("waiting...")
                time.sleep(10)
                continue

            if not batch:
                # All done
                break

            past_window = False
            for entry in batch:
                created = entry.created_at
                if created > DATE_UPPER_LIMIT:
                    continue
                elif created > DATE_LOWER_LIMIT:
                    # For a retweet, store the retweeted status instead.
                    original = entry
                    is_retweet = hasattr(entry, 'retweeted_status')
                    if is_retweet:
                        original = entry.retweeted_status

                    stored = session.query(Tweet).get(original.id)
                    if not stored:
                        stored = Tweet(**{
                            name: original.__getattribute__(name)
                            for name in TWEET_FIELDS
                        })
                        stored.author_id = original.author.id
                        session.add(stored)

                    if is_retweet:
                        link = Retweet(userid=self.id,
                                       tweetid=original.id,
                                       retweeted_at=created)
                        session.add(link)

                    self.timeline.append(stored)
                else:
                    past_window = True
                    break

            page += 1  # next page
            if past_window:
                break
Example #5
0
def agregar_conexiones(g):
    """
        Given an incomplete graph of users, add the "follows"
        relations between its nodes.

        Works on a copy of ``g`` and returns that copy. For every node
        the followed ids are fetched with ``TW.traer_seguidos`` and an
        edge is added towards each of them that is itself a node of
        the graph.
    """
    g = g.copy()
    uids_g = list(g.nodes())
    # Membership is tested once per followed id of every node; a set keeps
    # each test O(1) instead of scanning the whole node list.
    known_uids = set(uids_g)

    for uid in uids_g:
        print(uid)
        seguidos = TW.traer_seguidos(user_id=uid)
        g.add_edges_from(
            [(uid, sid) for sid in seguidos if sid in known_uids]
        )

    return g
Example #6
0
def fetch_retweets(user_id):
    """Download a user's retweets into ``retweets/<user_id>.json``.

    NOTE: flagged by the original author as NOT WORKING -- the entire
    timeline needs to be fetched instead.

    Nothing is fetched when the output file already exists. Otherwise
    pages are requested until an empty page comes back or a retweet not
    newer than ``RT_DATE_LIMIT`` is found. After every kept retweet the
    accumulated list is checkpointed to ``<file>.tmp``.

    A ``'Not authorized.'`` error adds the user to ``NOTAUTHORIZED``
    (re-pickled to ``NOTAUTHORIZED_FNAME``) and stops. Other errors sleep
    10 seconds and retry the same page, with no retry limit.

    Returns ``None``.
    """
    # NOT WORKING, need to fetch entire timeline
    retweets_file = "retweets/%s.json" % user_id
    tmp_file = retweets_file + ".tmp"

    print("Fetching retweets for user %d" % user_id)
    if os.path.exists(retweets_file):
        return

    retweets = []

    page = 1
    done = False
    while not done:
        # Original author's note: a fresh connection is requested to get
        # different credentials and avoid hitting the rate limit.
        TW_API = API_HANDLER.get_fresh_connection()
        try:
            rts = TW_API.retweets(user_id=user_id, page=page)
        except Exception as e:
            if e.message == u'Not authorized.':
                NOTAUTHORIZED.add(user_id)
                with open(NOTAUTHORIZED_FNAME, 'wb') as f:
                    pickle.dump(NOTAUTHORIZED, f)
                break
            print("Error: %s" % e.message)
            print("waiting...")
            time.sleep(10)
            continue

        if not rts:
            # All done
            break

        for t in rts:
            if t.created_at > RT_DATE_LIMIT:
                retweets.append({
                    "timestamp": t.created_at.strftime("%Y/%m/%d %H:%M:%S"),
                    "text": t.text,
                    "user_id": t.user.id,
                    "id": t.id
                })
                json_dump_unicode(retweets, tmp_file)
            else:
                done = True
                break
        page += 1  # next page

    if retweets:
        # Write the final file BEFORE dropping the checkpoint, so a failure
        # while dumping cannot leave us with neither file.
        json_dump_unicode(retweets, retweets_file)
        os.remove(tmp_file)
Example #7
0
def fetch_tweets(db, hashtags, posicion, desde, hasta):
    """Search tweets for one group of hashtags and store them in Mongo.

    The hashtags under ``hashtags[posicion]`` are OR-ed into one search
    query, restricted to Spanish and to the ``desde``/``hasta`` range.
    Each result's raw JSON is tagged with a ``'posicion'`` key and
    inserted into ``db.abortolegal`` unless a document with the same
    ``'id'`` is already there.

    The running count is printed every 500 results, and
    ``TW.get_fresh_connection()`` is called every 10000 results.
    """
    coll = db.abortolegal
    query = ' OR '.join(hashtags[posicion])
    cursor = tweepy.Cursor(
        TW.conn_.search,
        q=query,
        tweet_mode='extended',
        count=100,
        result_type="recent",
        since=desde,
        until=hasta,
        lang="es",
    )

    for seen, status in enumerate(cursor.items(), 1):
        doc = status._json
        doc['posicion'] = posicion

        already_stored = coll.find({'id': doc['id']}).count()
        if not already_stored:
            coll.insert(doc)

        if seen % 500 == 0:
            print(seen)
        if seen % 10000 == 0:
            TW.get_fresh_connection()
Example #8
0
    def fetch_favorites(self, session):
        """Collect the tweets this user favorited inside the date window.

        Pages of ``favorites`` are requested one by one. Tweets newer than
        ``DATE_UPPER_LIMIT`` are skipped; the first tweet that is not newer
        than ``DATE_LOWER_LIMIT`` ends the fetch. ``Tweet`` objects not
        found through ``session`` are created from ``TWEET_FIELDS`` and
        added to it.

        Every favorite in the window is appended to ``self.favs``,
        including tweets that were already stored. Appending only newly
        created tweets would silently drop known tweets from the list,
        and ``fetch_timeline`` appends unconditionally as well.

        An API error whose message is ``'Not authorized.'`` sets
        ``self.is_authorized`` to ``False`` and stops; any other error
        waits 10 seconds and retries the same page, with no retry limit.
        """
        print("Fetching favorites for user %d" % self.id)
        self.favs = []

        page = 1
        done = False
        while not done:
            TW_API = API_HANDLER.get_fresh_connection()
            try:
                tweets = TW_API.favorites(user_id=self.id, page=page)
            except Exception as e:
                if e.message == u'Not authorized.':
                    self.is_authorized = False
                    break
                print("Error: %s" % e.message)
                print("waiting...")
                time.sleep(10)
                continue  # retry the same page

            if not tweets:
                # All done
                break

            for t in tweets:
                if t.created_at > DATE_UPPER_LIMIT:
                    continue
                if not t.created_at > DATE_LOWER_LIMIT:
                    done = True
                    break

                tweet = session.query(Tweet).get(t.id)
                if not tweet:
                    tweet = Tweet(**{
                        f: getattr(t, f)
                        for f in TWEET_FIELDS
                    })
                    tweet.author_id = t.author.id
                    session.add(tweet)

                # Record the favorite whether or not the tweet was new.
                self.favs.append(tweet)

            page += 1  # next page
    def fetch_favorites(self, session):
        """Fill ``self.favs`` with the user's favorites in the date window.

        Walks ``favorites`` page by page. A tweet newer than
        ``DATE_UPPER_LIMIT`` is ignored; a tweet newer than
        ``DATE_LOWER_LIMIT`` is kept; anything else ends the fetch. Tweets
        not found through ``session`` are created from ``TWEET_FIELDS``
        and added to it.

        Kept tweets are appended to ``self.favs`` even when they were
        already stored. Previously only newly created tweets were
        appended, so favorites already known to the database were missing
        from the list.

        A ``'Not authorized.'`` error marks ``self.is_authorized`` as
        ``False`` and stops. Other errors sleep 10 seconds and retry the
        same page indefinitely.
        """
        print("Fetching favorites for user %d" % self.id)
        self.favs = []

        page = 1
        done = False
        while not done:
            TW_API = API_HANDLER.get_fresh_connection()
            try:
                tweets = TW_API.favorites(user_id=self.id, page=page)
            except Exception as e:
                if e.message == u'Not authorized.':
                    self.is_authorized = False
                    break
                else:
                    print("Error: %s" % e.message)
                    print("waiting...")
                    time.sleep(10)
                    continue

            if not tweets:
                # All done
                break

            for t in tweets:
                if t.created_at > DATE_UPPER_LIMIT:
                    continue
                elif t.created_at > DATE_LOWER_LIMIT:
                    tweet = session.query(Tweet).get(t.id)
                    if not tweet:
                        tweet = Tweet(**{f: getattr(t, f) for f in TWEET_FIELDS})
                        tweet.author_id = t.author.id
                        session.add(tweet)
                    # Outside the "if": existing tweets are favorites too.
                    self.favs.append(tweet)
                else:
                    done = True
                    break
            page += 1  # next page
Example #10
0
def get_follower_counts(user_id):
    """Return the ``followers_count`` of the Twitter user ``user_id``."""
    TW = API_HANDLER.get_connection()
    u = TW.get_user(user_id)
    return u.followers_count


def _add_missing_graph_users():
    """Store a ``User`` row for every graph node that has none yet.

    This code used to sit after the ``return`` of ``get_follower_counts``,
    where it could never run; it is kept here as a separate helper.

    Node ids are read from ``subgraph.gpickle`` and compared with the
    ``User.id`` values found through a session from ``open_session()``.
    For each missing id the user is looked up and a ``User`` with that id
    and the looked-up ``name`` is queued. All queued users are added and
    committed at the end. Lookups are best-effort: a failure is printed
    and that user is skipped. A fresh connection is requested after every
    20 ids.
    """
    import networkx as nx
    graph = nx.read_gpickle('subgraph.gpickle')

    session = open_session()

    gids = [int(x) for x in graph.nodes()]
    # Set, not list: membership is tested once per graph node.
    dbids = set(x for x, in session.query(User.id).all())
    user_ids = [x for x in gids if x not in dbids]

    users = []

    TW = API_HANDLER.get_fresh_connection()
    for i, uid in enumerate(user_ids):
        try:
            u = TW.get_user(uid)
            users.append(User(id=int(uid), username=u.name))
        except Exception as e:
            # Best effort: report the failure and move on to the next id.
            print("Could not fetch user %s: %s" % (uid, e))
        finally:
            if (i + 1) % 20 == 0:
                TW = API_HANDLER.get_fresh_connection()

    session.add_all(users)

    session.commit()
    session.close()
Example #12
0
    # Hashtags to search for, grouped under the keys "si" and "no"; the key
    # is what gets passed to fetch_tweets as its `posicion` argument below.
    # NOTE(review): presumably "si"/"no" are the two sides of the debate --
    # confirm with the data owner.
    hashtags = {
        "si": [
            "#abortolegal",
            "#abortolegalya",
            "#abortolegalesvida",
            "#AbortoLegalEsSalud",
            "#novotencontralasmujeres",
            "#quesealey",
            "#queelabortosealey",
            "#AbortoSeraLey"  # since 13/6
        ],
        "no": [
            "#elijamoslas2vidas",
            "#noalaborto",
            "#noalabortolegal",
            "#salvemoslasdosvidas",
            "#SalvemosLas2Vidas",
            "#ArgentinaEsProvida",
            "#CuidemoslasDosVidas",
            "#AbortoLegalEsMuerte",
            "#NoAlAbortoEnArgentina"  # since 13/6
        ]
    }

    # A fresh connection is requested before each of the two fetches.
    TW.get_fresh_connection()
    fetch_tweets(db, hashtags, "si", desde, hasta)

    TW.get_fresh_connection()
    fetch_tweets(db, hashtags, "no", desde, hasta)