Exemple #1
0
    def generate_tasks(self):
        """
        Refresh the shared Twitter client pool; no real task is produced.

        Profiles whose ``twitterApp_parameters_error`` flag is False are
        turned into clients through ``get_client_list()``. ``clients_queue``
        is then resized, emptied and refilled with those clients, after which
        a single ``None`` is yielded.

        Raises:
            MailReportableException: when no profile is left with the error
                flag cleared. Before raising, the flag is reset to False on
                every ``UserProfile``.
        """
        valid_profiles = UserProfile.objects.filter(
            twitterApp_parameters_error=False)
        clients_list = get_client_list(valid_profiles)
        # The filter is applied a second time on purpose (the original author
        # notes that doing it twice ensures the Twitter app is valid);
        # presumably get_client_list() can flag profiles -- TODO confirm.
        valid_profiles = valid_profiles.filter(
            twitterApp_parameters_error=False)

        if not valid_profiles:
            log('No valid Twitter client exists!')
            # Give every profile a fresh chance for the next launch.
            for any_profile in UserProfile.objects.all():
                any_profile.twitterApp_parameters_error = False
                any_profile.save()
            raise MailReportableException(
                'Twitter harvest has not launched',
                'No valid Twitter client exists! (reseting them all)')

        clients_queue.maxsize = len(clients_list)
        clear_twitter_client_queue()
        client_names = list(map(str, clients_list))
        log('Valid Twitter clients: %s' % client_names)
        for twitter_client in clients_list:
            clients_queue.put(twitter_client)

        yield None
Exemple #2
0
def harvest_twitter_user(twitter_user_harvester):
    """
    Walk a user's timeline and queue its tweets for update.

    Tweets are read one by one from a ``CustomCursor`` on ``user_timeline``.
    Each tweet's creation time is reduced to its day (time set to midnight,
    tzinfo forced to ``utc``) and compared with the harvester's
    ``harvest_until`` / ``harvest_since`` bounds. The loop stops when a tweet
    older than ``harvest_since`` is met, or after more than 10 consecutive
    empty results from the cursor. The harvester is then marked as completed
    and saved.

    Args:
        twitter_user_harvester: object exposing ``twitter_user``,
            ``harvest_since``, ``harvest_until``, ``harvest_completed`` and
            ``save()``.
    """
    harvester = twitter_user_harvester
    twitter_user = harvester.twitter_user
    cursor = CustomCursor('user_timeline', id=twitter_user._ident, count=200)
    day_format = "%Y-%m-%d"
    log('harvesting {} tweets from {} to {}'.format(
        twitter_user,
        harvester.harvest_since.strftime(day_format),
        harvester.harvest_until.strftime(day_format)))

    consecutive_empty = 0
    while True:
        tweet = cursor.next()
        if not tweet:
            # Tolerate a few empty answers in a row before giving up.
            consecutive_empty += 1
            if consecutive_empty > 10:
                break
            continue
        consecutive_empty = 0

        created_day = tweet.created_at.replace(
            hour=0, minute=0, second=0, microsecond=0, tzinfo=utc)

        if created_day <= harvester.harvest_until:
            global_task_queue.add(update_tweet_from_response, [tweet])

        # NOTE(review): the first tweet older than harvest_since has already
        # been queued above when this break triggers -- confirm it is wanted.
        if created_day < harvester.harvest_since:
            break

    log('Tweet-harvest completed for {}'.format(twitter_user_harvester))
    harvester.harvest_completed = True
    harvester.save()
def _fetch_tweets_from_html(term, since, until):
    """
    Scrape the Twitter search page and return the ids of the listed tweets.

    The request is retried for as long as it fails with ``socket.timeout``;
    ``monitor_stop_flag()`` is called before every attempt and
    ``safe_sleep(1)`` between attempts.

    Args:
        term: search term, inserted as-is into the query string.
        since: lower date bound; must provide ``strftime``.
        until: upper date bound; must provide ``strftime``.

    Returns:
        list of int: the ``data-item-id`` of every
        ``<li data-item-type="tweet">`` element carrying that attribute.
    """
    # Separators are percent-encoded: recent http.client versions reject
    # request targets that contain raw whitespace.
    # NOTE(review): `term` itself is not URL-quoted; a term containing spaces
    # or reserved characters would still yield an invalid or misparsed URL.
    url = 'https://twitter.com/search?q={}%20since%3A{}%20until%3A{}'.format(
        term, since.strftime("%Y-%m-%d"), until.strftime("%Y-%m-%d"))

    # Retry in a loop rather than by recursion so that a long series of
    # timeouts cannot exhaust the interpreter's recursion limit.
    while True:
        monitor_stop_flag()
        log(url)
        request = Request(url, headers={
            # A new user agent is drawn for every attempt.
            'User-Agent': random.choice(BROWSER_USER_AGENTS),
            'Host': 'twitter.com',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'en-US,fr-CA;q=0.8,en;q=0.5,fr;q=0.3',
            # NOTE(review): urlopen does not decompress responses itself; if
            # the server honours this header the parser receives compressed
            # bytes -- confirm this header is really wanted.
            'Accept-Encoding': 'gzip, deflate, br',
            'Referer': 'https://twitter.com/',
            'DNT': '1',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'Cache-Control': 'max-age=0, no-cache',
            'TE': 'Trailers',
            'Pragma': 'no-cache',
        })

        try:
            # NOTE(review): certificate verification is disabled here through
            # a private ssl helper; kept as-is, but this is a security risk.
            response = urlopen(request, timeout=5,
                               context=ssl._create_unverified_context())
            try:
                page = bs(response, "html.parser")
            finally:
                # Always release the connection, even if parsing fails.
                response.close()
        except socket.timeout:
            log('Socket timeout while fetching tweets from hashtag: #{}'.format(term))
            safe_sleep(1)
            continue
        break

    tweets = page.find_all('li', {"data-item-type": "tweet"})
    return [int(tweet['data-item-id'])
            for tweet in tweets if tweet.has_attr('data-item-id')]
Exemple #4
0
def monitor_progress():
    """
    Supervise the harvest forever, one check per monitoring delay.

    After each ``MONITORING_DELAY_IN_SECONDS`` pause the function:
    1. pops one pending ``(thread, error)`` pair from ``global_errors`` (if
       any), logs it and hands the error to ``manage_exception()``;
    2. compares the first field of ``global_process.memory_info()`` (divided
       by 1,000,000) with ``MAX_RAM_USAGE_LIMIT_IN_MEGABYTE``;
    3. displays the jobs statuses.

    Raises:
        MaxRAMUsageLimitReachedException: when the memory figure exceeds the
            configured limit.
    """
    while True:
        time.sleep(MONITORING_DELAY_IN_SECONDS)

        if not global_errors.empty():
            failed_thread, raised_error = global_errors.get()
            log('ERROR OCCURED IN THREAD: {}'.format(failed_thread))
            manage_exception(raised_error)

        used_megabytes = global_process.memory_info()[0] // 1000000
        if used_megabytes > MAX_RAM_USAGE_LIMIT_IN_MEGABYTE:
            raise MaxRAMUsageLimitReachedException

        display_jobs_statuses()
def harvest_twitter_hashtag(twitter_hashtag_harvester):
    """
    Repeatedly fetch and print the tweet ids found for a hashtag.

    The loop has no exit condition of its own: it only ends when
    ``monitor_stop_flag()`` or the fetch raises.

    Args:
        twitter_hashtag_harvester: object exposing ``twitter_hashtag`` (with
            a ``term`` attribute), ``harvest_since`` and ``harvest_until``.
    """
    hashtag = twitter_hashtag_harvester.twitter_hashtag
    log(hashtag)
    while True:
        monitor_stop_flag()

        # Bounds are re-read on every pass, exactly like the search term.
        term = hashtag.term
        since = twitter_hashtag_harvester.harvest_since
        until = twitter_hashtag_harvester.harvest_until
        found_ids = _fetch_tweets_from_html(term, since, until)
        pretty(found_ids)
    def generate_tasks(self):
        """
        Yield one tweet-harvest task per Twitter user harvester.

        Each task is a ``(harvest_twitter_user, [harvester])`` pair. When at
        least one harvester is returned by ``_get_twitter_user_list()``, a
        progress line is logged with that count over the number of
        ``ItemHarvester`` rows that have a Twitter user.
        """
        harvesters = _get_twitter_user_list()

        pending_count = harvesters.count()
        if pending_count:
            with_user_count = ItemHarvester.objects.filter(
                twitter_user__isnull=False).count()
            log("{}/{} Twitter users to tweet-harvest".format(
                pending_count, with_user_count))

        for harvester in harvesters:
            yield harvest_twitter_user, [harvester]
Exemple #7
0
def update_tweet_from_response(tweet_response):
    """
    Create or update the stored ``Tweet`` matching an API response.

    The tweet is looked up (or created) by ``tweet_response.id`` and updated
    from ``tweet_response._json``. If that update raises
    ``TWUser.DoesNotExist``, the tweet is saved without a user. The update
    frequency is then set to 1 when the tweet's user has a truthy
    ``harvested_by``, and to 5 otherwise (including when there is no user).

    Args:
        tweet_response: API status object exposing ``id`` and ``_json``.
    """
    tweet, _created = Tweet.objects.get_or_create(_ident=tweet_response.id)
    try:
        tweet.UpdateFromResponse(tweet_response._json)
    except TWUser.DoesNotExist:
        log("tweet #{}'s user does not exists!".format(tweet._ident))
        tweet.user = None
        tweet.save()

    # The user may be None (see the except branch above); dereferencing it
    # unconditionally used to raise AttributeError.
    user = tweet.user
    if user is not None and user.harvested_by:
        tweet._update_frequency = 1
    else:
        tweet._update_frequency = 5
    # NOTE(review): the new _update_frequency is not saved in this function;
    # confirm that a caller or a later step persists it.
Exemple #8
0
    def setInReplyToUser(self, **kwargs):
        """
        Resolve (or create) the replied-to ``TWUser`` and attach it to self.

        Args:
            **kwargs: lookup fields forwarded to ``get_from_any_or_create``.

        Raises:
            Exception: whatever ``get_from_any_or_create`` raised. Before the
                error is re-raised, the first ``TWUser`` matching ``kwargs``
                (if any) is flagged with ``_has_duplicate`` and saved, and
                the thread sleeps for 3 seconds.
        """
        try:
            twuser, _created = get_from_any_or_create(TWUser, **kwargs)
        except Exception:
            log('kwargs: %s' % kwargs)
            doubles = TWUser.objects.filter(**kwargs)
            # Fetch the row once: indexing the queryset repeatedly can return
            # a fresh instance each time, so the flag would be set on one
            # object and save() called on another.
            first_double = doubles.first()
            if first_double is not None:
                first_double._has_duplicate = True
                first_double.save()
                log('TWUSER %s HAS %s DUPLICATES!' %
                    (first_double, doubles.count() - 1))
            time.sleep(3)
            raise

        self.in_reply_to_user = twuser
Exemple #9
0
 def execute(self):
     """
     Run one harvest job: start consumers and producers, then monitor them.

     Two failure paths are handled here:
     - ``MaxRAMUsageLimitReachedException``: the threads are ended, the
       global task queue is cleared, the stop flag is lowered again and the
       method pauses for 5 seconds;
     - any other ``Exception``: the threads are ended and the error is
       logged; it is re-raised when ``DEBUG`` is truthy, mailed otherwise.

     'harvest ended' is logged whenever the method returns normally.
     """
     try:
         log('New job started.\n\n')
         log('Running job: "{}"'.format(self.name))
         generate_consumers()
         generate_producers()
         monitor_progress()
         log('Job "{}" has completed.'.format(self.name))
     except MaxRAMUsageLimitReachedException:
         ram_message = "Max RAM usage limit reached {} Mb. Restarting".format(
             MAX_RAM_USAGE_LIMIT_IN_MEGABYTE)
         logError(ram_message)
         end_threads()
         global_task_queue.clear()
         # Lower the stop flag that end_threads() raised.
         global_thread_stop_flag[0] = False
         time.sleep(5)
     except Exception:
         end_threads()
         failure_message = "An unknown exception occured while harvesting data."
         logError(failure_message)
         if DEBUG:
             raise
         mail_log('Aspira - Harvest Unknown Error', failure_message)
     log('harvest ended')
 def updateStatistics(self, jObject):
     """
     Record new statistic values extracted from a JSON-like object.

     ``self.statistics`` maps the name of a related manager on ``self`` to
     the sequence of keys leading to the value inside ``jObject``. For each
     entry a new record is created when there is no record yet, or when the
     latest record (by ``recorded_time``) holds a different value and was
     not recorded today.

     An entry whose key sequence cannot be followed is logged and skipped,
     so that a partially resolved value is never compared or stored.

     Args:
         jObject: nested mapping (e.g. a decoded JSON response).
     """
     for attrName, key_path in self.statistics.items():
         countObjs = getattr(self, attrName).order_by('-recorded_time')
         objType = countObjs.model

         val = jObject
         for key in key_path:
             if key not in val:
                 log('Invalid dict sequence: %s' % key_path)
                 break
             val = val[key]
         else:
             # Reached only when every key of the path was found.
             latest = countObjs.first()
             if latest is None:
                 objType.objects.create(comment=self, value=val)
             elif latest.value != int(val) and latest.recorded_time != today():
                 objType.objects.create(comment=self, value=val)
def update_twitter_users(twitter_user_batch):
    """
    Look up a batch of Twitter users and queue an update for each result.

    Users returned by the ``lookup_users`` API call are queued on
    ``global_task_queue`` with ``update_twitter_user_from_response``. Users
    for which the API returned nothing are logged, stamped with today's date
    as ``_last_updated``, given an ``_update_frequency`` of 5 and saved.

    Args:
        twitter_user_batch: collection exposing ``count()`` and iteration
            over objects with an ``_ident`` attribute.

    Raises:
        tweepy.error.TweepError: re-raised after being logged.
    """
    if not twitter_user_batch.count():
        return

    twitter_user_batch = list(twitter_user_batch)
    user_ids = [user._ident for user in twitter_user_batch]
    client = get_client('lookup_users')
    try:
        responses = client.call('lookup_users', user_ids=user_ids)
    except tweepy.error.TweepError:
        log('got tweepy.error.TweepError!')
        log('user_ids = %s' % user_ids)
        raise
    finally:
        # Hand the client back whatever happens, so that an unexpected
        # exception does not remove it from the pool.
        return_client(client)

    for response in responses:
        monitor_stop_flag()
        tw_user = next((user for user in twitter_user_batch
                        if user._ident == response._json['id']), None)
        if tw_user:
            global_task_queue.add(update_twitter_user_from_response,
                                  args=[tw_user, response._json])
            twitter_user_batch.remove(tw_user)

    # Whatever is left in the batch got no answer from the API.
    for tw_user in twitter_user_batch:
        log('Twitter user (%s) has returned no result.' % tw_user)
        tw_user._last_updated = today()
        tw_user._update_frequency = 5
        tw_user.save()
Exemple #12
0
def update_tweets(tweet_batch):
    """
    Look up a batch of tweets and queue an update for each result.

    Tweets returned by the ``statuses_lookup`` API call are queued on
    ``global_task_queue`` with ``update_tweet_from_response``. Tweets for
    which the API returned nothing get ``deleted_at`` set to today and are
    saved; their number is logged when it is not zero.

    Args:
        tweet_batch: collection exposing ``count()`` and iteration over
            objects with an ``_ident`` attribute.

    Raises:
        tweepy.error.TweepError: re-raised after being logged.
    """
    if not tweet_batch.count():
        return

    tweet_batch = list(tweet_batch)
    tweet_ids = [tweet._ident for tweet in tweet_batch]
    client = get_client('statuses_lookup')
    try:
        responses = client.call('statuses_lookup',
                                id_=tweet_ids,
                                trim_user=True)
    except tweepy.error.TweepError:
        log('got tweepy.error.TweepError!')
        log('tweet ids = %s' % tweet_ids)
        raise
    finally:
        # Hand the client back whatever happens, so that an unexpected
        # exception does not remove it from the pool.
        return_client(client)

    for response in responses:
        monitor_stop_flag()
        tweet = next(
            (tweet
             for tweet in tweet_batch if tweet._ident == response._json['id']),
            None)
        if tweet:
            global_task_queue.add(update_tweet_from_response, [response])
            tweet_batch.remove(tweet)

    # Whatever is left in the batch got no answer from the API.
    for tweet in tweet_batch:
        tweet.deleted_at = today()
        tweet.save()
    if tweet_batch:
        log("{} tweets have been deleted".format(len(tweet_batch)))
 def run(self):
     """
     Call ``self.execute()`` in a loop until the thread has to stop.

     - ``NonFatalExeption``: logged, then the loop is relaunched after
       ``self.relaunch_delay_in_seconds`` seconds;
     - ``GlobalStopFlag``: the thread ends quietly;
     - any other ``Exception``: reported through ``global_errors`` as a
       ``(thread, error)`` pair, then the thread ends.
     """
     # Relaunch with a loop instead of `return self.run()`: every non-fatal
     # error used to add a stack frame, which a long-lived thread could pile
     # up until hitting the recursion limit.
     while True:
         log('%s has started' % self.name)
         try:
             while True:
                 monitor_stop_flag()
                 self.execute()
         except NonFatalExeption:
             logError(
                 "({}) has encountered a non-fatal error. Relaunching in {} "
                 "seconds".format(
                     self.name,
                     self.relaunch_delay_in_seconds
                 )
             )
             safe_sleep(self.relaunch_delay_in_seconds)
         except GlobalStopFlag:
             log("Thread ended gracefully.")
             return
         except Exception as e:
             global_errors.put((self, e))
             return
Exemple #14
0
def end_threads():
    """
    Raise the global stop flag and join every thread that is still alive.

    Each join is given a 3 second timeout, and the final message is logged
    without checking whether the threads actually finished.
    """
    log('Ending all threads.')
    global_thread_stop_flag[0] = True
    for worker in threads_list[0]:
        if not worker.is_alive():
            continue
        log('Joining thread %s' % worker.name)
        worker.join(timeout=3)
    log('Successfully joined all threads')
Exemple #15
0
def joinTWUsers(user1, user2):
    """
    Merge *user2* into *user1*, delete *user2* and return *user1*.

    ``screen_name`` and ``_ident`` are copied from *user2* when they are
    truthy. Every item of the related collections listed below is then
    re-attached to *user1* through its ``twuser`` attribute and saved.
    """
    for field_name in ('screen_name', '_ident'):
        incoming_value = getattr(user2, field_name)
        if incoming_value:
            setattr(user1, field_name, incoming_value)

    history_labels = (
        'screen_names',
        'names',
        'time_zones',
        'urls',
        'descriptions',
        'statuses_counts',
        'favourites_counts',
        'followers_counts',
        'friends_counts',
        'listed_counts',
    )
    for label in history_labels:
        log('transfering all %s from %s to %s' % (label, user2, user1))
        related_items = getattr(user2, label).all()
        for related_item in related_items:
            related_item.twuser = user1
            related_item.save()

    user1.save()
    user2.delete()
    return user1
Exemple #16
0
def clear_twitter_client_queue():
    """Log the operation, then pop items from ``clients_queue`` until it reports empty."""
    log('Clearing Twitter Clients queue')
    while True:
        if clients_queue.empty():
            break
        clients_queue.get()
 def clear(self):
     """
     Empty every task queue held in ``self._tasks_queues``.

     The previous implementation ran ``del q`` on the loop variable, which
     only unbinds that local name and leaves the queues untouched.

     The element type of the queues is not visible from here, so they are
     emptied by duck typing: ``clear()`` when the container offers it,
     otherwise an ``empty()`` / ``get_nowait()`` drain.
     NOTE(review): confirm this against the actual queue type.
     """
     with self._mutex:
         for tasks in self._tasks_queues.values():
             if hasattr(tasks, 'clear'):
                 tasks.clear()
             else:
                 while not tasks.empty():
                     tasks.get_nowait()
     log("Cleared the tasks queues.")
 def truncate_text(self):
     """
     Normalize ``self.text`` and shorten it to ``self._text_max_length``.

     A falsy text becomes the empty string. A text longer than the maximum
     is cut so that, with a trailing '...', it is exactly
     ``_text_max_length`` characters long; the truncation is logged. A text
     that is exactly at the maximum already fits and is left untouched.
     """
     if not self.text:
         self.text = ""
     if len(self.text) > self._text_max_length:
         self.text = self.text[:self._text_max_length - 3] + '...'
         log('%s\'s text has been truncated!' % self)