def search_term(harvester, snh_search):
    """Harvest new status ids for *snh_search*, paging backwards in time.

    Repeatedly scrapes result pages via collect_tweets_from_html(), moving
    max_id just below the oldest id seen so far, until at least 80 ids have
    been collected or a page comes back empty.

    Returns a list of status-id strings, or None when nothing new was found.
    """
    if debugging: dLogger.log("search_term()")
    new_statuses_list = []
    last_harvested_status = snh_search.latest_status_harvested
    max_id = None
    if last_harvested_status:
        # Resume just below the last harvested status so it is not re-fetched.
        max_id = int(last_harvested_status.fid) - 1
        if debugging:
            # Fixed typo: "statuse" -> "status".
            dLogger.log('    Latest status harvested date: %s' % last_harvested_status.created_at)
    while len(new_statuses_list) < 80:
        status_id_list = collect_tweets_from_html(harvester, snh_search, max_id)
        if status_id_list:
            new_statuses_list += status_id_list
            # Page below the oldest id returned by this page.
            max_id = int(status_id_list[-1]) - 1
        else:
            break
    if not new_statuses_list:
        return None
    return new_statuses_list
def collect_tweets_from_html(harvester, snh_search, max_id=None):
    """Scrape one page of tweet ids from the public twitter.com hashtag HTML.

    Builds a `term since:... until:...` query constrained to the harvester's
    harvest window (plus an optional max_id), fetches the page with a
    browser-like User-Agent, and returns the 'data-item-id' of every tweet
    <li> found. Returns [] when the results container is missing.
    """
    if debugging: dLogger.log('collect_tweets_from_html()')
    #dLogger.log('    snh_search: %s'%snh_search)
    #dLogger.log('    max_id: %s'%max_id)
    since = datetime.strftime(harvester.harvest_window_from, '%Y-%m-%d')
    until = datetime.strftime(harvester.harvest_window_to, '%Y-%m-%d')
    query = snh_search.term.encode('utf-8')
    params = '%s since:%s until:%s' % (query, since, until)
    if max_id:
        params += ' max_id:%s' % max_id
    strUrl = 'https://twitter.com/hashtag/' + urllib.quote(params)
    if debugging: dLogger.log('    URL: %s' % strUrl)
    url = urllib2.Request(strUrl, headers={'User-Agent': 'Mozilla/5.0'})
    try:
        data = urllib2.urlopen(url, timeout=5)
    except Exception:
        # Was a bare `except:` (also trapped KeyboardInterrupt/SystemExit).
        # One retry after a short pause; a second failure propagates.
        time.sleep(1)
        data = urllib2.urlopen(url, timeout=5)
    page = bs(data, "html.parser")
    tweetBox = page.find('ol', id='stream-items-id')
    if tweetBox is None:
        # No results container (layout change or empty result page): the
        # original crashed with AttributeError here; treat as an empty page.
        return []
    status_id_list = []
    for tweet in tweetBox.findAll('li'):
        if tweet.has_attr('data-item-id'):
            status_id_list.append(tweet['data-item-id'])
    return status_id_list
def update_comment_status(self, comment, post):
    # Create or refresh the FBComment row matching *comment* (a Facebook
    # Graph API dict) attached to *post*.
    # Returns the FBComment instance, or None when every recovery path
    # failed (the failure is logged, never raised).
    if debugging: dLogger.log("<ThreadComment#%s>::update_comment_status()" % self.ident)
    #dLogger.log("    comment: %s"%comment)
    #dLogger.log("    message: %s"%unicode(comment['message']))
    fbcomment = None
    try:
        try:
            fbcomment = FBComment.objects.get(fid=comment["id"])
        except ObjectDoesNotExist:
            # First time we see this comment: create a stub row, then let
            # update_from_facebook fill it in below.
            fbcomment = FBComment(post=post)
            fbcomment.save()
        fbcomment.update_from_facebook(comment, post)
    except IntegrityError:
        # A concurrent worker inserted the same comment between our get()
        # and save(): fetch the winner and update that row instead.
        try:
            fbcomment = FBComment.objects.get(fid=comment["id"])
            fbcomment.update_from_facebook(comment, post)
        except ObjectDoesNotExist:
            msg = u"ERROR! Comments already exist but not found%s for %s" % (
                unicode(comment), post.fid if post.fid else "0")
            logger.exception(msg)
            if debugging: dLogger.exception(msg)
    except:
        # Catch-all: any other failure is logged and swallowed so one bad
        # comment does not abort the whole harvest.
        msg = u"<p style='red'>Cannot update comment %s for %s</p>" % (
            unicode(comment), post.fid if post.fid else "0")
        logger.exception(msg)
        if debugging: dLogger.exception(msg)
    return fbcomment
def update_user_batch(harvester):
    """Queue a Graph-API batch update for the next slice of Facebook users.

    Works on up to 10000 users whose pk lies past the harvester's
    dont_harvest_further_than marker, wrapping back to the start of the
    table once the tail has been reached. Users flagged error_triggered
    are skipped with a log line.
    """
    if debugging: dLogger.log("update_user_batch()")
    #get 10000 unupdated FBUsers
    pending = harvester.fbusers_to_harvest.filter(
        pk__gt=harvester.dont_harvest_further_than)
    if pending.count() == 0:
        # Reached the end of the table: wrap around to the beginning.
        harvester.dont_harvest_further_than = 0
        harvester.save()
        pending = harvester.fbusers_to_harvest.filter(
            pk__gt=harvester.dont_harvest_further_than)
    pending = pending.filter(pk__lt=harvester.dont_harvest_further_than + 10000)
    batch_man = []
    for snhuser in pending:
        if snhuser.error_triggered:
            logger.info(
                u"Skipping user update: %s(%s) because user has triggered the error flag."
                % (unicode(snhuser), snhuser.fid if snhuser.fid else "0"))
            continue
        # Prefer the numeric fid; fall back to the username.
        uid = snhuser.fid if snhuser.fid else snhuser.username
        #if debugging: dLogger.log("    uid: %s"%uid)
        batch_man.append({
            "snh_obj": snhuser,
            "retry": 0,
            "request": {"method": "GET", "relative_url": str(uid)},
            "callback": update_user_from_batch,
        })
    generic_batch_processor_v2(harvester, batch_man)
def update_user_statuses_batch(harvester):
    # Build one Graph batch request per non-errored user asking for their
    # feed (250 posts, with like/comment counts as summaries only) and hand
    # the whole batch to the generic processor; each response is routed to
    # update_user_feed_from_batch.
    if debugging: dLogger.log("update_user_statuses_batch()")
    all_users = harvester.fbusers_to_harvest.all()
    batch_man = []
    for snhuser in all_users:
        if not snhuser.error_triggered:
            # Prefer the numeric fid; fall back to the username.
            uid = snhuser.fid if snhuser.fid else snhuser.username
            d = {
                "method": "GET",
                "relative_url": str("%s/feed?limit=250&fields=comments.limit(0).summary(true),\
likes.limit(0).summary(true),shares,message,message_tags,name,caption,description,properties,privacy,type,\
place,story,story_tags,object_id,application,updated_time,picture,link,source,icon,from" % str(uid))
            }
            #if debugging: dLogger.log("    d: %s"%d)
            batch_man.append({
                "snh_obj": snhuser,
                "retry": 0,
                "request": d,
                "callback": update_user_feed_from_batch
            })
        else:
            logger.info(
                u"Skipping status update: %s(%s) because user has triggered the error flag."
                % (unicode(snhuser), snhuser.fid if snhuser.fid else "0"))
    #usage = psutil.virtual_memory()
    logger.info(u"Will harvest statuses for %s" % (harvester))
    generic_batch_processor_v2(harvester, batch_man)
def update_user_fk(self, self_prop, face_prop, facebook_model):
    """Resolve the FBUser referenced by facebook_model[face_prop] and return
    (model_changed, user) so the caller can rebind its foreign key.

    The user is looked up by fid, created from the Facebook payload when
    missing, and re-fetched once when creation loses an IntegrityError race.
    """
    #if debugging: dLogger.log("<FBComment: %s>::update_user_fk()"%self.fid)
    model_changed = False
    if face_prop in facebook_model:
        prop_val = facebook_model[face_prop]
        if prop_val and (self_prop is None or self_prop.fid != prop_val["id"]):
            user = self.get_existing_user({"fid__exact": prop_val["id"]})
            if not user:
                try:
                    user = FBUser()
                    user.update_from_facebook(prop_val)
                    if debugging: dLogger.log(" new user created: %s" % user)
                except IntegrityError:
                    # Lost a creation race to another worker: use theirs.
                    user = self.get_existing_user({"fid__exact": prop_val["id"]})
                    if user:
                        user.update_from_facebook(prop_val)
                    else:
                        logger.debug(u">>>>CRITICAL CANT UPDATED DUPLICATED USER %s" % prop_val["id"])
            self_prop = user
            model_changed = True
    return model_changed, self_prop
def custom_migration():
    """One-off migration: copy every TwitterHarvester2 row into a fresh
    TwitterHarvester, field by field.

    Only the scalar/FK fields listed in *params* are copied; the m2m
    relations (twusers/twsearch) are intentionally left out (see the
    commented entries).
    """
    params = [
        'harvester_type',
        'client',
        'tt_client',
        'consumer_key',
        'consumer_secret',
        'access_token_key',
        'access_token_secret',
        'remaining_search_hits',
        'remaining_user_timeline_hits',
        'remaining_user_lookup_hits',
        'reset_time_in_seconds',
        'hourly_limit',
        'reset_time',
        #'twusers_to_harvest',
        #'twsearch_to_harvest',
        'last_harvested_user',
        'current_harvested_user',
        'last_updated_user',
        'current_updated_user',
    ]
    for harv2 in TwitterHarvester2.objects.all():
        harv = TwitterHarvester.objects.create()
        for param in params:
            # Fixed: this per-field debug line ran unconditionally; every
            # other dLogger.log call in the module is gated on `debugging`.
            if debugging: dLogger.log('param: %s' % param)
            setattr(harv, param, getattr(harv2, param))
        harv.save()
        if debugging: dLogger.log('copied %s' % harv)
def sort_harvesters_by_priority(all_harvesters):
    """Order harvesters for the next run: never-run ones first, then clean
    ones by oldest last start, then previously-aborted ones by oldest
    current start.

    Fixed idiom: `== None` / `!= None` replaced with `is None` /
    `is not None` (PEP 8; equality can be overridden, identity cannot).
    """
    if debugging: dLogger.log("sort_harvesters_by_priority()")
    # Never started at all: highest priority.
    new_harvesters = [
        harv for harv in all_harvesters
        if harv.last_harvest_start_time is None
    ]
    # A non-None current start time means the previous run never finished.
    aborted_harvesters = [
        harv for harv in all_harvesters
        if harv.current_harvest_start_time is not None
        and harv not in new_harvesters
    ]
    clean_harvesters = [
        harv for harv in all_harvesters
        if harv not in aborted_harvesters and harv not in new_harvesters
    ]
    sorted_harvester_list = new_harvesters
    sorted_harvester_list += sorted(
        clean_harvesters,
        key=lambda harvester: harvester.last_harvest_start_time)
    sorted_harvester_list += sorted(
        aborted_harvesters,
        key=lambda harvester: harvester.current_harvest_start_time)
    if debugging:
        dLogger.log('    sorted_harvester_list: %s' % sorted_harvester_list)
    return sorted_harvester_list
def get_stats(self):
    """Return the parent stats dict extended with this Twitter harvester's
    rate-limit counters and harvest cursors under the "concrete" key.

    Bug fixed: the "current_harvested_user" debug line printed
    self.last_harvested_user (copy-paste from the line above).
    """
    if debugging:
        dLogger.log("get_stats()")
        dLogger.log(
            "    remaining_hits (search)(timeline)(user): (%s)(%s)(%s)" %
            (self.remaining_search_hits, self.remaining_user_timeline_hits,
             self.remaining_user_lookup_hits))
        dLogger.log("    reset_time: %s" % self.reset_time)
        dLogger.log("    last_harvested_user: %s" % self.last_harvested_user)
        dLogger.log("    current_harvested_user: %s" % self.current_harvested_user)
    parent_stats = super(TwitterHarvester, self).get_stats()
    parent_stats["concrete"] = {
        "remaining_hits (search)(timeline)(user)":
            (self.remaining_search_hits, self.remaining_user_timeline_hits,
             self.remaining_user_lookup_hits),
        "reset_time_in_seconds": self.reset_time_in_seconds,
        "hourly_limit": self.hourly_limit,
        "reset_time": self.reset_time,
        "last_harvested_user": unicode(self.last_harvested_user),
        "current_harvested_user": unicode(self.current_harvested_user),
    }
    return parent_stats
def get_tw_harvester_status_list(request, call_type, harvester_id):
    """Datatables JSON feed of TWStatus rows for one harvester (or all of
    them when harvester_id == '0')."""
    dLogger.log('get_tw_harvester_status_list()')
    columnIndexNameMap = {
        0: u'created_at',
        1: u'fid',
        2: u'text',
        3: u'retweet_count',
        4: u'retweeted',
        5: u'source',
    }
    if harvester_id == '0':
        querySet = TWStatus.objects.all()
    else:
        harvester = get_list_or_404(TwitterHarvester, pmk_id=harvester_id)[0]
        if harvester.twusers_to_harvest.count() > 100:
            return twUserAjaxTableError("Too many items to display")
        # OR together one Q filter per harvested user and per harvested
        # search term, then de-duplicate.
        conditions = [Q(user=u) for u in harvester.twusers_to_harvest.all()]
        conditions += [Q(TWSearch_hit=s)
                       for s in harvester.twsearch_to_harvest.all()]
        combined = reduce(lambda acc, cond: acc | cond, conditions)
        querySet = TWStatus.objects.filter(combined).distinct()
    #call to generic function from utils
    return get_datatables_records(request, querySet, columnIndexNameMap, call_type)
def update_client_stats(self):
    """updates the remaining api calls the instance is allowed to do before annoying Twitter """
    if debugging: dLogger.log("update_client_stats()")
    c = self.get_client()
    # One API call returns the rate buckets for all three families we use.
    rates = c.GetRateLimitStatus("search,statuses,users")["resources"]
    searchRates = rates["search"]["/search/tweets"]
    userLookupRates = rates["users"]["/users/lookup"]
    statusTimelineRates = rates["statuses"]["/statuses/user_timeline"]
    # NOTE(review): search hits are read from /statuses/lookup rather than
    # from searchRates ("/search/tweets") extracted above, which is then
    # only used for reset/limit -- looks like a copy-paste slip; confirm
    # which endpoint the search harvest actually consumes.
    self.remaining_search_hits = rates["statuses"]["/statuses/lookup"][
        'remaining']
    self.remaining_user_timeline_hits = statusTimelineRates["remaining"]
    self.remaining_user_lookup_hits = userLookupRates["remaining"]
    # Next reset is the furthest away of the three endpoint reset epochs.
    self.reset_time_in_seconds = max(
        searchRates["reset"],
        userLookupRates["reset"],
        statusTimelineRates["reset"],
    )
    self.hourly_limit = searchRates[
        "limit"]  # Since all three limits are the same
    self.reset_time = time.strftime(
        '%Y-%m-%d %H:%M:%S', time.localtime(self.reset_time_in_seconds))
    self.save()
def start_new_harvest(self):
    """Mark this harvester as running: stamp the start time, zero the call
    counter, raise the in-progress flag and persist."""
    if debugging: dLogger.log('start_new_harvest()')
    self.harvest_in_progress = True
    self.current_harvest_call_count = 0
    self.current_harvest_start_time = datetime.now()
    self.save()
def get_latest_status(self):
    """Return the first TWStatus for this user when ordered by created_at
    ascending, or None when the user has no statuses.

    NOTE(review): ascending order means this actually yields the *oldest*
    status despite the name -- confirm whether "-created_at" was intended.
    """
    if debugging: dLogger.log("get_latest_status()")
    statuses = TWStatus.objects.filter(user=self).order_by("created_at")
    for status in statuses:
        return status
    return None
def gbp_core(harvester, bman_chunk, error_map, next_bman_list, failed_list):
    # Execute one batch-manager chunk against the Facebook Graph batch API.
    # Each successful sub-result is dispatched to its entry's callback
    # (which may return follow-up requests, appended to next_bman_list);
    # failures are classified by error code into retry (next_bman_list) or
    # abandon (failed_list). error_map accumulates per-code counters.
    if debugging: dLogger.log("gbp_core()")
    #dLogger.log("    harvester: %s"%harvester)
    #dLogger.log("    bman_chunk: %s"%bman_chunk)
    #dLogger.log("    error_map: %s"%error_map)
    #dLogger.log("    next_bman_list: %s"%next_bman_list)
    #dLogger.log("    failed_list: %s"%failed_list)
    error = False
    try:
        urlized_batch = [
            bman_chunk[j]["request"] for j in range(0, len(bman_chunk))
        ]
        #if debugging: dLogger.log("    urlized_batch: %s"%urlized_batch)
        batch_result = harvester.api_call("request", {
            'path': '',
            'post_args': {
                "batch": urlized_batch
            }
        })
        #dLogger.pretty(batch_result)
        # Responses come back in request order; pair each with its entry.
        for (counter, fbobj) in enumerate(batch_result):
            bman_obj = bman_chunk[counter]
            if type(fbobj) == dict:
                # Success: let the callback process the payload; it may
                # return follow-up requests (e.g. next comment pages).
                next = bman_obj["callback"](harvester, bman_obj["snh_obj"],
                                            fbobj)
                if next:
                    next_bman_list += next
            else:
                e_code = gbp_error_man(bman_obj, fbobj)
                if e_code == E_UNEX:
                    error = True
                # NOTE(review): ternary precedence means the first
                # occurrence of a code stores 0, not 1 -- confirm whether
                # the counter is meant to start at 1.
                error_map[e_code] = error_map[
                    e_code] + 1 if e_code in error_map else 0
                if e_code in E_CRITICALS:
                    failed_list.append(bman_obj)
                else:
                    # Non-critical: requeue this single entry for retry.
                    next_bman_list.append(bman_obj)
    except FacepyError, fex:
        # The whole batch call failed: classify once and retry or abandon
        # the entire chunk.
        e_code = gbp_facepyerror_man(fex, {"bman_chunk": bman_chunk})
        if e_code == E_UNEX:
            error = True
        error_map[e_code] = error_map[e_code] + 1 if e_code in error_map else 0
        if e_code in E_CRITICALS:
            msg = u"CRITICAL gbp_core: Unmanaged FacepyError error:%s. Aborting a full bman_chunk." % (
                e_code)
            logger.exception(msg)
            failed_list += bman_chunk
        else:
            next_bman_list += bman_chunk
def get_tt_client(self):
    """Return the Twython client for this harvester's OAuth credentials,
    creating and caching it on first use."""
    if debugging: dLogger.log("get_tt_client()")
    if not self.tt_client:
        self.tt_client = Twython(
            self.consumer_key,
            self.consumer_secret,
            self.access_token_key,
            self.access_token_secret)
    return self.tt_client
def run_twitter_harvester():
    # Entry point for one full Twitter harvest pass: orders all harvesters
    # by priority, then for each active one runs three phases in order
    # (user lookup, timeline, search), refreshing rate-limit stats before
    # each phase and skipping a phase whose limit is exhausted.
    if debugging: dLogger.log("run_twitter_harvester()")
    #custom_export()
    #return
    harvester_list = sort_harvesters_by_priority(
        TwitterHarvester.objects.all())
    # Clear stale in-progress flags left behind by a crashed previous run.
    for harvester in harvester_list:
        harvester.harvest_in_progress = False
        harvester.save()
    logger.info('Will run in order: %s' % harvester_list)
    try:
        for harvester in harvester_list:
            logger.info(u"The harvester %s is %s" %
                        (unicode(harvester),
                         "active" if harvester.is_active else "inactive"))
            if harvester.is_active:
                harvester.start_new_harvest()
                harvester.update_client_stats()
                if harvester.remaining_user_lookup_hits <= 0:
                    warn = u"The harvester %s has exceeded the user lookup rate limit. Need to wait? %s" % (
                        unicode(harvester), harvester.get_stats())
                    logger.warning(warn)
                else:
                    run_users_update(harvester)
                harvester.update_client_stats()
                # Timeline phase is skipped only when BOTH the timeline and
                # the lookup budgets are exhausted.
                if harvester.remaining_user_timeline_hits <= 0 and harvester.remaining_user_lookup_hits <= 0:
                    warn = u"The harvester %s has exceeded the status rate limits. Need to wait? %s" % (
                        unicode(harvester), harvester.get_stats())
                    logger.warning(warn)
                else:
                    run_harvester_timeline(harvester)
                harvester.update_client_stats()
                if harvester.remaining_search_hits <= 0:
                    warn = u"The harvester %s has exceeded the search rate limit. Need to wait? %s" % (
                        unicode(harvester), harvester.get_stats())
                    logger.warning(warn)
                else:
                    run_harvester_search(harvester)
                harvester.update_client_stats()
                harvester.end_current_harvest()
        if debugging: dLogger.log('Harvest has ended for all harvesters')
    except:
        # On any failure, make sure no harvester stays flagged as running,
        # then re-raise the original exception.
        for harvester in harvester_list:
            harvester.harvest_in_progress = False
            harvester.save()
        raise
def update_from_youtube(self, snh_video, snh_user, yt_comment):  #Comment
    """Sync this YTComment model from a raw YouTube API comment resource,
    saving (and stamping model_update_date) only when a field changed."""
    if debugging: dLogger.log("<YTComment: '%s'>::update_from_youtube()" % self)
    #dLogger.pretty(yt_comment)
    model_changed = False

    def _assign(field, value):
        # Set self.<field> only when it differs; report whether it changed.
        if getattr(self, field) != value:
            setattr(self, field, value)
            return True
        return False

    model_changed |= _assign('fid', yt_comment['id'])
    snippet = yt_comment['snippet']
    model_changed |= _assign('video', snh_video)
    model_changed |= _assign('user', snh_user)
    model_changed |= _assign(
        'published',
        datetime.strptime(snippet['publishedAt'][:-5], '%Y-%m-%dT%H:%M:%S'))
    model_changed |= _assign(
        'updated',
        datetime.strptime(snippet['updatedAt'][:-5], '%Y-%m-%dT%H:%M:%S'))
    content = snippet['textDisplay'].encode('unicode_escape')
    content = re.sub(r'\\\\x..', '', content)
    model_changed |= _assign('message', content)
    model_changed |= _assign('like_count', snippet['likeCount'])
    if model_changed:
        self.model_update_date = datetime.utcnow()
        try:
            self.save()
        except Exception as e:
            dLogger.log(' Error while saving:')
            dLogger.exception(e)
            dLogger.pretty(str(yt_comment).encode('unicode_escape'))
def manage_exception(retry_count, harvester, user):
    """Log the current exception for *user* and bump the retry counter.

    Returns (new_retry_count, give_up): give_up is True once the count
    exceeds the harvester's max_retry_on_fail.
    """
    if debugging:
        dLogger.log(
            "manage_exception(retry_count: %s, harvester: %s, user: %s)" %
            (retry_count, harvester, user))
    msg = u"Exception for the harvester %s for %s. Retry:%d" % (
        harvester, unicode(user), retry_count)
    logger.exception(msg)
    if debugging: dLogger.exception(msg)
    retry_count = retry_count + 1
    return (retry_count, retry_count > harvester.max_retry_on_fail)
def end_current_harvest(self):
    """Close out the running harvest: roll the current_* fields into their
    last_* counterparts, record whether a user harvest was left unfinished,
    drop the in-progress flag and persist.

    Bug fixed: the debug line logged 'start_new_harvest()' (copy-pasted
    from that method) instead of this method's own name.
    """
    if debugging: dLogger.log('end_current_harvest()')
    self.last_harvest_start_time = self.current_harvest_start_time
    self.last_harvest_end_time = datetime.now()
    self.current_harvest_start_time = None
    self.last_harvest_call_count = self.current_harvest_call_count
    # A still-set current user means the last harvest did not finish it.
    self.last_user_harvest_was_aborted = bool(
        self.get_current_harvested_user())
    self.harvest_in_progress = False
    self.save()
def update_user_comments_from_batch(harvester, statusid, fbcomments_page):
    # Batch callback: persist one page of comments for *statusid* as raw
    # FBResult rows (computed into FBComment later by compute_results) and,
    # when Facebook reports another page, return a follow-up batch request
    # list to fetch it; returns [] when done or on error.
    #if debugging:
    #dLogger.log("update_user_comments_from_batch(statusid: %s)"%statusid)
    next_bman = []
    #if "data" not in fbcomments_page:
    #logger.debug("DEVED: %s: %s" % (statusid, fbcomments_page))
    comment_count = None
    # The batch API wraps each response; the real payload is JSON in 'body'.
    fbcomments_page = json.loads(fbcomments_page['body'])
    if not 'error' in fbcomments_page:
        comment_count = len(fbcomments_page["data"])
    else:
        logger.debug('ERROR: status %s could not be harvested: %s' %
                     (statusid, fbcomments_page['error']))
    if comment_count:
        waitCount = 0
        for comment in fbcomments_page["data"]:
            # Store the raw payload; parsing is deferred to compute_results.
            res = FBResult()
            res.harvester = harvester
            res.result = comment
            res.ftype = "FBComment"
            res.fid = comment["id"]
            res.parent = statusid
            res.save()
            waitCount += 1
        if debugging:
            dLogger.log("    %s more comments in waiting..." % waitCount)
        paging, new_page = get_comment_paging(fbcomments_page)
        #usage = psutil.virtual_memory()
        #logger.debug(u"Updating %d comments. New: %s Paging: %s Mem:%s MB" % (comment_count, new_page, paging, int(usage[4])/(1024.0)))
        if new_page:
            # Queue the next comment page for this same status.
            d = {
                "method": "GET",
                "relative_url": str("%s/comments?limit=250%s" %
                                    (statusid, paging))
            }
            next_bman.append({
                "snh_obj": statusid,
                "retry": 0,
                "request": d,
                "callback": update_user_comments_from_batch
            })
        #else:
        #    logger.debug("Empty comment page!! %s" % fbcomments_page)
    return next_bman
def update_user_status(status, user, keepRaw):
    """Upsert the TWStatus row matching a twitter-api status object, sync it
    from the payload, and remember it as the user's last harvested status."""
    #if debugging: dLogger.log( "update_user_status(status: '%s...', user: %s)"%(status.text[:60], user.screen_name))
    created = False
    try:
        tw_status = TWStatus.objects.get(fid__exact=status.id)
    except ObjectDoesNotExist:
        tw_status = TWStatus(user=user)
        tw_status.save()
        created = True
    if created and debugging:
        dLogger.log("    New <TWStatus> created('%s...')" % (tw_status))
    tw_status.update_from_twitter(status, user, keepRaw)
    user.last_harvested_status = tw_status
    user.save()
def generate_csv_stream(request, dataLength, data, filename='output.csv'):
    """Stream *data* (an indexable sequence of csv-writable rows, dataLength
    long) to the client as a csv file attachment.

    Bug fixed: the inner generator was also named ``data``, shadowing the
    ``data`` parameter in this scope, so ``data[i]`` inside it resolved to
    the generator function itself and raised TypeError on first iteration.
    """
    dLogger.log("generate_csv_stream()")

    def stream_rows():
        # One chunk per row; csv.writer handles quoting and escaping.
        for i in xrange(dataLength):
            csvfile = StringIO.StringIO()
            csv.writer(csvfile).writerow(data[i])
            yield csvfile.getvalue()

    response = HttpResponse(stream_rows(), mimetype="text/csv")
    response["Content-Disposition"] = "attachment; filename=%s" % filename
    return response
def run_harvester_search(harvester):
    """Run the term-search harvest over every TWSearch attached to
    *harvester*; Twitter API errors are logged, never raised."""
    if debugging:
        dLogger.log("run_harvester_search(harvester: %s)" % (harvester))
    logger.info(u"START SEARCH: %s Stats:%s" %
                (harvester, unicode(harvester.get_stats())))
    try:
        search_all_terms(harvester, harvester.twsearch_to_harvest.all())
    except twitter.TwitterError:
        msg = u"ERROR for %s" % harvester
        logger.exception(msg)
        if debugging: dLogger.exception(msg)
def compute_results(harvester):
    """Turn this harvester's queued raw FBResult rows into posts and
    comments, then purge the processed rows."""
    if debugging:
        dLogger.log("compute_results()")
        dLogger.log("    %s items to analyze" % FBResult.objects.count())
    pending = FBResult.objects.filter(harvester=harvester)
    if pending.count() != 0:
        start = time.time()
        logger.info(u"Starting results computation")
        compute_new_post(harvester)
        compute_new_comment(harvester)
        pending.delete()
        logger.info(u"Results computation complete in %ss" %
                    (time.time() - start))
def build_harvester_sequence(self):
    """Fill haverst_deque with every twuser to harvest, rotated around the
    last harvested user so the run resumes where the previous one stopped.

    The deque is consumed with pop() (right end), so bumping the pivot by
    one when a retry is due places the aborted user where it is popped
    first.
    """
    if debugging: dLogger.log("build_harvester_sequence()")
    self.haverst_deque = deque()
    all_users = list(self.twusers_to_harvest.all())
    if not self.last_harvested_user:
        self.haverst_deque.extend(all_users)
        return
    pivot = all_users.index(self.last_harvested_user)
    if self.retry_user_after_abortion and self.last_user_harvest_was_aborted:
        pivot += 1
    self.haverst_deque.extend(all_users[pivot:])
    self.haverst_deque.extend(all_users[:pivot])
def update_user_from_batch(harvester, snhuser, fbuser):
    """Batch callback: apply a Graph API user payload to *snhuser*.

    On failure the user is flagged (error_triggered) so later passes skip
    it. The harvester's progress marker always advances to this user's pk.
    Returns None: no follow-up batch requests are generated.
    """
    if debugging: dLogger.log("update_user_from_batch()")
    #dLogger.log("fbuser: %s"%fbuser)
    try:
        snhuser.update_from_facebook(fbuser)
    except Exception:
        # Was `except BaseException`, which also swallowed
        # KeyboardInterrupt/SystemExit; Exception keeps the flag-and-continue
        # behavior without trapping interpreter-exit signals.
        logger.info('update failed for user %s' % snhuser)
        snhuser.error_triggered = True
        snhuser.save()
    #Recycling an unused field to store the last updated user. Not a good solution. To be revised.
    harvester.dont_harvest_further_than = snhuser.pk
    harvester.save()
    return None
def get_next_user_to_harvest(self):
    """Advance the harvest cursor: archive the current user as last
    harvested, pop the next one off haverst_deque (building the sequence
    on first use), refresh client stats, and return the new current user
    (None once the sequence is exhausted)."""
    if debugging: dLogger.log("%s::get_next_user_to_harvest()" % self)
    if self.current_harvested_user:
        self.last_harvested_user = self.current_harvested_user
    if self.haverst_deque is None:
        self.build_harvester_sequence()
    try:
        next_user = self.haverst_deque.pop()
    except IndexError:
        next_user = None
    self.current_harvested_user = next_user
    self.update_client_stats()
    return self.current_harvested_user
def run_users_update(harvester):
    """Refresh Twitter user profiles batch by batch until the lookup rate
    budget is spent or no batch remains."""
    if debugging:
        dLogger.log("run_users_update(harvester: %s)" % (harvester))
    logger.info(u"START user update: %s Stats:%s" %
                (harvester, unicode(harvester.get_stats())))
    while harvester.remaining_user_lookup_hits > 0:
        logger.debug(u"New user batch to update. User lookup hits to go: %s" %
                     (harvester.remaining_user_lookup_hits))
        user_batch = harvester.get_next_user_batch_to_update()
        if not user_batch:
            break
        update_user_batch(harvester, user_batch)
    logger.info(u"End user update for %s Stats:%s" %
                (harvester, unicode(harvester.get_stats())))
def fb(request, harvester_id):
    """Render the Facebook harvesters dashboard page."""
    facebook_harvesters = FacebookHarvester.objects.all()
    dLogger.log('facebook_harvesters: %s' % facebook_harvesters)
    context = {
        u'fb_selected': True,
        u'all_harvesters': facebook_harvesters,
        u'harvester_id': harvester_id,
        # Column-wise field layouts for the status/comment pickers.
        'status_fields': izip_longest(*fb_posts_fields),
        'comment_fields': izip_longest(*fb_comments_fields),
        'years': choiceYears,
        'months': choiceMonths,
        'days': choiceDays,
        "now": now,
    }
    return render_to_response(u'snh/facebook.html', context)
def build_harvester_sequence(self):
    """Fill haverst_deque with every ytuser to harvest, rotated around the
    last harvested user so the run resumes where the previous one stopped
    (optionally re-queuing an aborted user)."""
    if debugging: dLogger.log("%s::build_harvester_sequence()" % self)
    self.haverst_deque = deque()
    all_users = self.ytusers_to_harvest.all()
    if self.last_harvested_user:
        # Locate the last harvested user; when absent the pivot ends up at
        # len(all_users), matching the original manual counting loop.
        count = len(all_users)
        for idx, user in enumerate(all_users):
            if user == self.last_harvested_user:
                count = idx
                break
        retry_last_on_fail = 1 if self.retry_user_after_abortion and self.last_user_harvest_was_aborted else 0
        self.haverst_deque.extend(all_users[count + retry_last_on_fail:])
        self.haverst_deque.extend(all_users[:count + retry_last_on_fail])
    else:
        self.haverst_deque.extend(all_users)