Example #1
    def fetch_timeline(self, session):
        print "Fetching timeline for user %d" % self.id
        start_time = time.time()
        # Re-authenticating here gives us a different set of credentials
        # every time we start processing a new county, to avoid hitting the rate limit.
        self.timeline = []

        page = 1
        done = False
        while not done:
            TW_API = API_HANDLER.get_fresh_connection()
            try:
                tweets = TW_API.user_timeline(user_id=self.id, page=page)
            except Exception, e:
                if e.message == u'Not authorized.':
                    self.is_authorized = False
                    break
                else:
                    print("Error: %s" % e.message)
                    print "waiting..."
                    time.sleep(10)
                    continue

            if not tweets:
                # All done
                break
            else:
                for t in tweets:
                    if t.created_at > DATE_UPPER_LIMIT:
                        continue
                    elif t.created_at > DATE_LOWER_LIMIT:
                        isretweet = False
                        if hasattr(t, 'retweeted_status'):
                            retweeted_at = t.created_at
                            t = t.retweeted_status
                            isretweet = True

                        tid = t.id
                        tweet = session.query(Tweet).get(tid)
                        if not tweet:
                            tweet = Tweet(**{
                                f: getattr(t, f)
                                for f in TWEET_FIELDS
                            })
                            tweet.author_id = t.author.id
                            session.add(tweet)

                        if isretweet:
                            retweet = Retweet(userid=self.id,
                                              tweetid=t.id,
                                              retweeted_at=retweeted_at)
                            session.add(retweet)

                        self.timeline.append(tweet)

                    else:
                        done = True
                        break
            page += 1  # next page
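
These examples rely on an `API_HANDLER` object whose `get_fresh_connection()` hands back a tweepy client authenticated with a different set of credentials on each call, which is what keeps the crawler under the rate limit. That handler is not part of the snippet; a minimal sketch of what such a rotator could look like (the `APIHandler` class and the shape of the credentials list are assumptions, not the original code) is:

import itertools

import tweepy


class APIHandler(object):
    # Hypothetical credential rotator: round-robins over several OAuth key sets
    # so that no single application key absorbs all the requests.

    def __init__(self, credentials):
        # credentials: list of dicts with consumer_key, consumer_secret,
        # access_token and access_token_secret
        self._cycle = itertools.cycle(credentials)

    def get_fresh_connection(self):
        creds = next(self._cycle)
        auth = tweepy.OAuthHandler(creds['consumer_key'], creds['consumer_secret'])
        auth.set_access_token(creds['access_token'], creds['access_token_secret'])
        return tweepy.API(auth)
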
def fetch_retweets(user_id):
    # NOT WORKING, need to fetch entire timeline
    retweets_file = "retweets/%s.json" % user_id

    print "Fetching retweets for user %d" % user_id
    start_time = time.time()
    if not os.path.exists(retweets_file):
        # Re-authenticating here gives us a different set of credentials
        # every time we start processing a new county, to avoid hitting the rate limit.
        retweets = []

        page = 1
        done = False
        while not done:
            TW_API = API_HANDLER.get_fresh_connection()
            try:
                rts = TW_API.retweets(user_id=user_id, page=page)
            except Exception, e:                
                if e.message == u'Not authorized.':
                    NOTAUTHORIZED.add(user_id)
                    with open(NOTAUTHORIZED_FNAME, 'wb') as f:
                        pickle.dump(NOTAUTHORIZED, f)
                    break
                else:
                    print("Error: %s" % e.message)
                    print "waiting..."
                    time.sleep(10)
                    continue

            if rts:
                for t in rts:
                    if t.created_at > RT_DATE_LIMIT:
                        retweets.append({
                                "timestamp": t.created_at.strftime("%Y/%m/%d %H:%M:%S"),
                                "text": t.text,
                                "user_id": t.user.id,
                                "id": t.id
                            })
                        json_dump_unicode(retweets, retweets_file + ".tmp")
                    else:
                        done = True
                        break
            else:
                # All done
                break
            page += 1  # next page

        if retweets:
            # Write the final file first, then drop the checkpoint.
            json_dump_unicode(retweets, retweets_file)
            os.remove(retweets_file + ".tmp")
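
`json_dump_unicode` is used for checkpointing but is not defined in this snippet. A plausible helper with that behaviour (an assumption about the original, not a copy of it) would simply write the list as UTF-8 JSON without ASCII-escaping:

import codecs
import json


def json_dump_unicode(data, fname):
    # Assumed helper: serialize to JSON, keeping non-ASCII characters
    # (e.g. accented Spanish text) instead of escaping them.
    with codecs.open(fname, 'w', encoding='utf-8') as f:
        f.write(json.dumps(data, ensure_ascii=False))
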
Example #4
def fetch_tweets(db, hashtags, posicion, desde, hasta):
    coll = db.abortolegal
    query = ' OR '.join(hashtags[posicion])
    count = 0
    for status in tweepy.Cursor(TW.conn_.search,
                                q=query,
                                tweet_mode='extended',
                                count=100,
                                result_type="recent",
                                since=desde,
                                until=hasta,
                                lang="es").items():
        data = status._json
        # Tag each stored tweet with the hashtag side ("si"/"no") it was matched by.
        data['posicion'] = posicion

        # Only insert tweets we have not stored yet (dedup on the tweet id).
        if not coll.find({'id': data['id']}).count():
            coll.insert(data)

        count += 1
        if count % 500 == 0:
            print count
        if count % 10000 == 0:
            TW.get_fresh_connection()
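
Note that `coll.find(...).count()` and `coll.insert(...)` are the old PyMongo APIs and are removed in PyMongo 4.x. On a newer driver the same duplicate check could be written (a hedged equivalent, not the original code) as:

# Same dedup on PyMongo 3.7+/4.x: count_documents replaces cursor.count(),
# insert_one replaces insert.
if coll.count_documents({'id': data['id']}, limit=1) == 0:
    coll.insert_one(data)
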
Example #5
    def fetch_favorites(self, session):
        print "Fetching favorites for user %d" % self.id
        start_time = time.time()
        self.favs = []

        page = 1
        done = False
        while not done:
            TW_API = API_HANDLER.get_fresh_connection()
            try:
                tweets = TW_API.favorites(user_id=self.id, page=page)
            except Exception, e:
                if e.message == u'Not authorized.':
                    self.is_authorized = False
                    break
                else:
                    print("Error: %s" % e.message)
                    print "waiting..."
                    time.sleep(10)
                    continue

            if not tweets:
                # All done
                break
            else:
                for t in tweets:
                    if t.created_at > DATE_UPPER_LIMIT:
                        continue
                    elif t.created_at > DATE_LOWER_LIMIT:
                        tid = t.id
                        tweet = session.query(Tweet).get(tid)
                        if not tweet:
                            tweet = Tweet(**{
                                f: getattr(t, f)
                                for f in TWEET_FIELDS
                            })
                            tweet.author_id = t.author.id
                            session.add(tweet)
                            self.favs.append(tweet)
                    else:
                        done = True
                        break
            page += 1  # next page
    # initialize_db()
    
    import networkx as nx
    graph = nx.read_gpickle('subgraph.gpickle')
    
    session = open_session()

    gids = [int(x) for x in graph.nodes()]
    # Set membership keeps the "missing" computation linear instead of quadratic.
    dbids = set(x for x, in session.query(User.id).all())
    missing = [x for x in gids if x not in dbids]

    user_ids = missing
    
    users = []

    TW = API_HANDLER.get_fresh_connection()
    for i, uid in enumerate(user_ids):
        try:
            u = TW.get_user(uid)
            user = User(id=int(uid), username=u.name)
            users.append(user)
        except Exception:
            # Skip ids that cannot be fetched (e.g. suspended or deleted accounts).
            pass
        finally:
            # Rotate credentials every 20 lookups to stay under the rate limit.
            if (i + 1) % 20 == 0:
                TW = API_HANDLER.get_fresh_connection()

    session.add_all(users)

    session.commit()
    session.close()
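
Looking users up one at a time spends a request per id, which is why the connection is rotated every 20 lookups. tweepy also exposes a bulk `lookup_users` call that accepts up to 100 ids per request (keyword `user_ids` in tweepy 3.x), so the same backfill could be sketched, under that assumption, as:

users = []
for start in range(0, len(missing), 100):
    batch = missing[start:start + 100]
    try:
        for u in TW.lookup_users(user_ids=batch):
            users.append(User(id=u.id, username=u.name))
    except Exception:
        # A whole batch can fail (e.g. every id suspended or deleted); skip it.
        pass
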
Example #8
    hashtags = {
        "si": [
            "#abortolegal",
            "#abortolegalya",
            "#abortolegalesvida",
            "#AbortoLegalEsSalud",
            "#novotencontralasmujeres",
            "#quesealey",
            "#queelabortosealey",
            "#AbortoSeraLey"  # desde 13/6
        ],
        "no": [
            "#elijamoslas2vidas",
            "#noalaborto",
            "#noalabortolegal",
            "#salvemoslasdosvidas",
            "#SalvemosLas2Vidas",
            "#ArgentinaEsProvida",
            "#CuidemoslasDosVidas",
            "#AbortoLegalEsMuerte",
            "#NoAlAbortoEnArgentina"  # desde 13/6
        ]
    }

    TW.get_fresh_connection()
    fetch_tweets(db, hashtags, "si", desde, hasta)

    TW.get_fresh_connection()
    fetch_tweets(db, hashtags, "no", desde, hasta)