def scrape_sources_for_events(sources):
    """Yield one ``(shard, json_state)`` pair per event discovered from *sources*.

    A lookup object is taken from ``fb_mapreduce.get_fblookup()`` with its
    ``allow_cache`` flag switched off, then handed (together with *sources*)
    to ``thing_scraper.discover_events_from_sources``.  For every discovered
    item the 'found-event-to-check' counter is incremented and a tuple is
    yielded: ``_shard_for(event_id)`` followed by the JSON encoding of
    ``(event_id, source_id, source_field, extra_source_id)``.
    """
    lookup = fb_mapreduce.get_fblookup()
    # Caching is turned off on the lookup before any scraping happens.
    lookup.allow_cache = False
    for found in thing_scraper.discover_events_from_sources(lookup, sources):
        payload = (found.event_id, found.source_id, found.source_field, found.extra_source_id)
        mr.increment('found-event-to-check')
        shard_key = _shard_for(found.event_id)
        yield (shard_key, json.dumps(payload))
def classify_events(fbl, pe_list, fb_list):
    """Run the auto-classifiers over potential events and act on clear verdicts.

    ``pe_list`` and ``fb_list`` are walked in lockstep.  Events the auto-add
    classifier accepts are added through ``add_entities.add_update_event``;
    events the auto-not-add classifier rejects are only marked as looked at.

    Returns a list of report lines: '+'-prefixed, tab-separated rows for
    events that were added and '-'-prefixed rows for events auto-rejected.
    """
    output_rows = []
    for pe, fetched in zip(pe_list, fb_list):
        # A fetch result flagged 'empty' is handled exactly like a missing one.
        fb_event = None if (fetched and fetched['empty']) else fetched

        # Record past-event status first and save it, so the next mapreduce
        # pass should not need to process this potential event again.
        if pe.set_past_event(fb_event):
            pe.put()
        if not fb_event:
            continue

        # Skip events already looked at, or that never needed a look.  The
        # pre-filtering mapreduce never sends those, but scraping a user's
        # potential events throws everything in here.
        needs_review = pe.should_look_at and not pe.looked_at
        if not needs_review:
            continue

        classified = event_classifier.classified_event_from_fb_event(fb_event)
        classified.classify()
        add_verdict = event_auto_classifier.is_auto_add_event(classified)

        if add_verdict[0]:
            logging.info("Found event %s, looking up location", pe.fb_event_id)
            where = event_locations.LocationInfo(fb_event)
            columns = (
                pe.fb_event_id,
                True,
                where.final_city,
                where.final_city is not None,
                where.fb_address,
                fb_event['info'].get('name', ''),
            )
            added_row = '+%s\n' % '\t'.join([unicode(col) for col in columns])
            try:
                logging.info(
                    'VTFI %s: Adding event %s, due to pe-invite-ids: %s',
                    pe.fb_event_id, pe.fb_event_id, pe.get_invite_uids()
                )
                add_entities.add_update_event(
                    fb_event,
                    fbl,
                    visible_to_fb_uids=pe.get_invite_uids(),
                    creating_method=eventdata.CM_AUTO,
                )
                stored_pe = potential_events.PotentialEvent.get_by_key_name(pe.fb_event_id)
                stored_pe.looked_at = True
                stored_pe.auto_looked_at = True
                stored_pe.put()
                # TODO(lambert): handle un-add-able events differently
                output_rows.append(added_row)
                mr.increment('auto-added-dance-events')
            except fb_api.NoFetchedDataException as err:
                logging.error("Error adding event %s, no fetched data: %s", pe.fb_event_id, err)
            except add_entities.AddEventException as err:
                # NOTE(review): the message says "no fetched data" although this
                # handler catches AddEventException -- looks copy-pasted.
                logging.warning("Error adding event %s, no fetched data: %s", pe.fb_event_id, err)

        # The not-add classifier runs regardless of the add outcome above and
        # receives the add verdict as context.
        reject_verdict = event_auto_classifier.is_auto_notadd_event(classified, auto_add_result=add_verdict)
        if not reject_verdict[0]:
            continue
        stored_pe = potential_events.PotentialEvent.get_by_key_name(pe.fb_event_id)
        stored_pe.looked_at = True
        stored_pe.auto_looked_at = True
        stored_pe.put()
        rejected_columns = (pe.fb_event_id, fb_event['info'].get('name', ''))
        output_rows.append('-%s\n' % '\t'.join([unicode(col) for col in rejected_columns]))
        mr.increment('auto-notadded-dance-events')
    return output_rows
def yield_maybe_delete_bad_event(fbl, db_event):
    """Re-check an auto-added event and yield the operations needed to fix it.

    Only events whose ``creating_method`` is CM_AUTO or CM_AUTO_ATTENDEE, whose
    fb_event is not 'empty', and which were created on or after 2016-03-05 are
    examined.  Depending on the re-classification this generator yields:

    * a ``op.db.Put`` when the event is still good but its ``creating_method``
      has to be corrected,
    * otherwise a UTF-8 encoded report line, followed (only when the mapper
      param 'allow_deletes' is set) by ``op.db.Delete`` operations for the
      event and its DisplayEvent, if one exists.
    """
    mr_ctx = context.get()
    # Without a mapreduce context, deletion is never allowed.
    allow_deletes = mr_ctx.mapreduce_spec.mapper.params['allow_deletes'] if mr_ctx else False

    if db_event.creating_method not in (eventdata.CM_AUTO_ATTENDEE, eventdata.CM_AUTO):
        return
    if db_event.fb_event['empty']:
        return

    import datetime
    # This is when we started adding all sorts of "crap"
    crap_cutoff = datetime.datetime(2016, 3, 5)
    if not db_event.creation_time or db_event.creation_time < crap_cutoff:
        return

    logging.info('MDBE: Check on event %s: %s', db_event.id, db_event.creating_method)
    from event_scraper import auto_add
    from nlp import event_classifier
    classified_event = event_classifier.get_classified_event(db_event.fb_event)

    # Case 1: the text alone marks this as a good event -> it belongs to CM_AUTO.
    if auto_add.is_good_event_by_text(db_event.fb_event, classified_event):
        if db_event.creating_method != eventdata.CM_AUTO:
            db_event.creating_method = eventdata.CM_AUTO
            yield op.db.Put(db_event)
        return

    # Case 2: the attendee check marks it as good -> it belongs to CM_AUTO_ATTENDEE.
    attendees_ok = event_attendee_classifier.is_good_event_by_attendees(
        fbl, db_event.fb_event, classified_event=classified_event
    )
    if attendees_ok:
        if db_event.creating_method != eventdata.CM_AUTO_ATTENDEE:
            db_event.creating_method = eventdata.CM_AUTO_ATTENDEE
            yield op.db.Put(db_event)
        return

    # Case 3: neither classifier accepts it -> report it, and delete if permitted.
    logging.info(
        'Accidentally %s added event %s: %s: %s',
        db_event.creating_method, db_event.fb_event_id, db_event.country, db_event.name
    )
    mr.increment('deleting-bad-event')
    report_fields = (db_event.fb_event_id, db_event.creating_method, db_event.country, db_event.name)
    report_line = '%s: %s: %s: %s\n' % report_fields
    yield report_line.encode('utf-8')
    if not allow_deletes:
        return

    from search import search
    search.delete_from_fulltext_search_index(db_event.fb_event_id)
    yield op.db.Delete(db_event)
    display_event = search.DisplayEvent.get_by_id(db_event.fb_event_id)
    if display_event:
        yield op.db.Delete(display_event)
def count_private_events(fbl, e_list):
    """Yield one tab-separated privacy row per event and update privacy counters.

    Each row holds the event id, the start date (``YYYY-MM-DD`` or an empty
    string), the privacy value ('UNKNOWN' when absent) and the member count
    reported by ``fb_api.get_all_members_count``.  Events without an 'info'
    key, or whose processing raises ``fb_api.NoFetchedDataException``, are
    logged and skipped.  ``fbl`` is accepted but not used here.
    """
    for event in e_list:
        try:
            fb_data = event.fb_event
            if 'info' not in fb_data:
                logging.error("skipping row2 for event id %s", event.fb_event_id)
                continue

            member_count = fb_api.get_all_members_count(fb_data)
            # Counted when the event is not public but still passes is_public_ish().
            nonpublic_and_large = (not fb_events.is_public(fb_data)) and fb_events.is_public_ish(fb_data)
            if nonpublic_and_large:
                mr.increment('nonpublic-and-large')

            privacy = fb_data['info'].get('privacy', 'UNKNOWN')
            mr.increment('privacy-%s' % privacy)

            if event.start_time:
                start_date = event.start_time.strftime('%Y-%m-%d')
            else:
                start_date = ''

            columns = [event.fb_event_id, start_date, privacy, member_count]
            yield '%s\n' % '\t'.join([str(col) for col in columns])
        except fb_api.NoFetchedDataException:
            logging.error("skipping row for event id %s", event.fb_event_id)
def count_private_events(fbl, e_list):
    """Yield one tab-separated privacy row per event and update privacy counters.

    Each row holds the event id, the start date (``YYYY-MM-DD`` or an empty
    string), the privacy value (defaulting to 'OPEN') and the member count
    reported by ``fb_api.get_all_members_count``.  A non-'OPEN' event with more
    than 60 members also bumps the 'nonpublic-and-large' counter.  Events
    without an 'info' key, or whose processing raises
    ``fb_api.NoFetchedDataException``, are logged and skipped.  ``fbl`` is
    accepted but not used here.
    """
    for event in e_list:
        try:
            fb_data = event.fb_event
            if 'info' not in fb_data:
                logging.error("skipping row2 for event id %s", event.fb_event_id)
                continue

            member_count = fb_api.get_all_members_count(fb_data)
            privacy = fb_data['info'].get('privacy', 'OPEN')
            is_restricted = privacy != 'OPEN'
            if is_restricted and member_count > 60:
                mr.increment('nonpublic-and-large')
            mr.increment('privacy-%s' % privacy)

            if event.start_time:
                start_date = event.start_time.strftime('%Y-%m-%d')
            else:
                start_date = ''

            columns = [event.fb_event_id, start_date, privacy, member_count]
            yield '%s\n' % '\t'.join([str(col) for col in columns])
        except fb_api.NoFetchedDataException:
            logging.error("skipping row for event id %s", event.fb_event_id)
def really_classify_events(fbl, new_pe_list, new_fb_list, allow_posting=True):
    """Classify fetched events and auto-add the ones judged to be good.

    An event is considered good when the text classifier
    (``event_auto_classifier.is_auto_add_event``) accepts it, or -- failing
    that, and only when attendee data was fetched -- when
    ``event_attendee_classifier.is_good_event_by_attendees`` accepts it.

    Args:
        fbl: lookup object; used for ``get_multi`` and forwarded to the
            attendee classifier and to ``add_entities.add_update_event``.
        new_pe_list: potential-event objects parallel to ``new_fb_list``.  A
            falsy value means "no potential events"; every event is then
            processed with ``pe`` set to None.
        new_fb_list: fetched event dicts; each is read via ``['info']['id']``.
        allow_posting: forwarded unchanged to ``add_entities.add_update_event``.

    Returns:
        A list with one '+'-prefixed, tab-separated "id, name" line per event
        that was added.
    """
    if not new_pe_list:
        new_pe_list = [None] * len(new_fb_list)
    logging.info(
        'Filtering out already-added events and others, have %s remaining events to run the classifier on',
        len(new_fb_list)
    )
    fb_event_ids = [x['info']['id'] for x in new_fb_list]
    fb_attending_maybe_list = fbl.get_multi(fb_api.LookupEventAttendingMaybe, fb_event_ids, allow_fail=True)

    results = []
    for pe, fb_event, fb_event_attending_maybe in zip(new_pe_list, new_fb_list, fb_attending_maybe_list):
        event_id = fb_event['info']['id']
        logging.info('Is Good Event By Text: %s: Checking...', event_id)
        classified_event = event_classifier.get_classified_event(fb_event)
        auto_add_result = event_auto_classifier.is_auto_add_event(classified_event)
        logging.info('Is Good Event By Text: %s: %s', event_id, auto_add_result)

        good_event = False
        # Reset per iteration so a value from an earlier event can never leak in.
        method = None
        if auto_add_result and auto_add_result[0]:
            good_event = auto_add_result[0]
            method = eventdata.CM_AUTO
        elif fb_event_attending_maybe:
            logging.info('Is Good Event By Attendees: %s: Checking...', event_id)
            good_event = event_attendee_classifier.is_good_event_by_attendees(
                fbl, fb_event, fb_event_attending_maybe=fb_event_attending_maybe, classified_event=classified_event
            )
            logging.info('Is Good Event By Attendees: %s: %s', event_id, good_event)
            method = eventdata.CM_AUTO_ATTENDEE

        if not good_event:
            continue

        result = '+%s\n' % '\t'.join((event_id, fb_event['info'].get('name', '')))
        try:
            invite_ids = pe.get_invite_uids() if pe else []
            logging.info('VTFI %s: Adding event %s, due to pe-invite-ids: %s', event_id, event_id, invite_ids)
            # Named distinctly from the exception variables below, which
            # previously shadowed it.
            added_event = add_entities.add_update_event(
                fb_event, fbl, visible_to_fb_uids=invite_ids, creating_method=method, allow_posting=allow_posting
            )
            pe2 = potential_events.PotentialEvent.get_by_key_name(event_id)
            if pe2 is not None:
                pe2.looked_at = True
                pe2.auto_looked_at = True
                pe2.put()
            else:
                # This function may run without potential events (pe is None),
                # so the lookup can miss.  Presumably get_by_key_name follows
                # the datastore convention of returning None on a miss --
                # confirm.  The event was already added above, so a miss must
                # not abort the rest of the batch with an AttributeError.
                logging.warning('No PotentialEvent found for added event %s, cannot mark it as looked at', event_id)
            # TODO(lambert): handle un-add-able events differently
            results.append(result)
            mr.increment('auto-added-dance-events')
            if added_event.start_time < datetime.datetime.now():
                mr.increment('auto-added-dance-events-past')
                mr.increment('auto-added-dance-events-past-eventid-%s' % event_id)
            else:
                mr.increment('auto-added-dance-events-future')
        except fb_api.NoFetchedDataException as e:
            logging.error("Error adding event %s, no fetched data: %s", event_id, e)
        except add_entities.AddEventException as e:
            # NOTE(review): the message text says "no fetched data" although
            # this handler catches AddEventException; kept unchanged for log
            # compatibility.
            logging.warning("Error adding event %s, no fetched data: %s", event_id, e)
    return results
def classify_events(fbl, pe_list, fb_list):
    """Filter potential events down to classifiable ones, then classify them.

    ``pe_list`` and ``fb_list`` are walked in lockstep.  A pair is dropped (and
    a 'skip-due-to-*' counter bumped) when the fetched event is missing or
    'empty', when the potential event was already looked at, or when its event
    id is not purely numeric.  The surviving pairs are handed to
    ``really_classify_events``, whose return value is returned unchanged.
    """
    # Go through and find all potential events we actually want to attempt to classify
    kept_pairs = []
    for pe, fb_event in zip(pe_list, fb_list):
        # Get past events out of the way and saved first, so the next pass of
        # this mapreduce should not need to process them.
        if pe.set_past_event(fb_event):
            pe.put()

        if not fb_event or fb_event['empty']:
            mr.increment('skip-due-to-empty')
            continue

        # Already-looked-at events are not pre-filtered when we scrape a user's
        # potential events and throw them all in here, so drop them now.
        if pe.looked_at:
            logging.info('Already looked at event (added, or manually discarded), so no need to re-process.')
            mr.increment('skip-due-to-looked-at')
            continue

        event_id = pe.fb_event_id
        if re.match(r'^\d+$', event_id) is None:
            logging.error('Found a very strange potential event id: %s', event_id)
            mr.increment('skip-due-to-bad-id')
            continue

        kept_pairs.append((pe, fb_event))

    new_pe_list = [kept_pe for kept_pe, _ in kept_pairs]
    new_fb_list = [kept_fb for _, kept_fb in kept_pairs]
    return really_classify_events(fbl, new_pe_list, new_fb_list)
def get_json(self, **kwargs):
    """Fetch this endpoint with *kwargs* as query parameters and decode the JSON.

    When ``self.use_private_key`` is set, a ``client`` parameter is added and
    the request path is signed with HMAC-SHA1 using the URL-safe-base64
    decoded ``google_maps_private_key``; otherwise ``google_server_key`` is
    appended as the ``key`` parameter.

    Returns:
        The decoded JSON response, or None when the body is not valid JSON.
    """
    mr.increment('gmaps-api-%s' % self.name)

    use_signature = self.use_private_key
    if use_signature:
        kwargs['client'] = 'free-dancedeets'
    # Built once; the signature (if any) is computed over exactly this string.
    unsigned_url_path = "%s?%s" % (self.path, urls.urlencode(kwargs))

    if use_signature:
        decoded_key = base64.urlsafe_b64decode(google_maps_private_key)
        signature = hmac.new(decoded_key, unsigned_url_path, hashlib.sha1)
        encoded_signature = base64.urlsafe_b64encode(signature.digest())
        url = "%s%s&signature=%s" % (self.protocol_host, unsigned_url_path, encoded_signature)
    else:
        url = "%s%s&key=%s" % (self.protocol_host, unsigned_url_path, google_server_key)

    # NOTE(review): this logs the full URL, including the key/signature.
    logging.info('geocoding url: %s', url)

    # Close the response explicitly instead of relying on garbage collection.
    response = urllib.urlopen(url)
    try:
        result = response.read()
    finally:
        response.close()
    logging.info('geocoding results: %s', result)

    try:
        return json.loads(result)
    except ValueError:
        return None
def track_lookup(cls):
    """Bump the 'fb-lookups-source' counter, passing 1 (presumably the delta)."""
    counter_name = 'fb-lookups-source'
    mr.increment(counter_name, 1)
def update_mailchimp(user):
    """Push *user*'s email and profile fields to the Mailchimp list.

    The target list is the mapper param 'mailchimp_list_id' when running inside
    a mapreduce context that provides one, otherwise ``mailchimp_api.LIST_ID``.

    Steps:
      1. Users without an email are counted, logged and skipped.
      2. If the email differs from ``user.mailchimp_email`` and a previous
         Mailchimp email exists, Mailchimp is asked to change the address.
         ``user.mailchimp_email`` is then set to the current email; this
         function does not save the user itself.
      3. The member record (merge fields, language, optional geocoded
         location) is written with ``mailchimp_api.add_members``.

    Returns None; outcomes are reported through logging and ``mr`` counters.
    """
    ctx = context.get()
    mailchimp_list_id = -1
    if ctx:
        params = ctx.mapreduce_spec.mapper.params
        mailchimp_list_id = params.get('mailchimp_list_id', mailchimp_list_id)
    if mailchimp_list_id == -1:
        mailchimp_list_id = mailchimp_api.LIST_ID

    # Only the language part of a locale such as 'en_US' is sent along.
    trimmed_locale = (user.locale or '').split('_')[0]

    if not user.email:
        mr.increment('mailchimp-error-no-email')
        logging.info('No email for user %s: %s', user.fb_uid, user.full_name)
        return

    if user.mailchimp_email != user.email:
        # When some old users are saved, their mailchimp email will be None,
        # so we don't really need to worry about them here.
        logging.info('Updating user email to %s with old mailchimp email %s', user.email, user.mailchimp_email)
        if user.mailchimp_email is not None:
            # NOTE(review): this *error* counter is bumped before the call is
            # even attempted, and again below on a bad response -- it looks
            # like an attempt counter was intended.  Kept as-is so existing
            # counter values stay comparable; confirm and rename.
            mr.increment('mailchimp-update-email-error-response')
            try:
                # Use the resolved list id so the email change lands on the
                # same list that add_members() writes to below (this was
                # hard-wired to mailchimp_api.LIST_ID before).
                user_data = mailchimp_api.update_email(mailchimp_list_id, user.mailchimp_email, user.email)
            except mailchimp_api.UserNotFound:
                mr.increment('mailchimp-update-email-error-not-found')
                logging.error('Updating user %s email to mailchimp, returned not found', user.fb_uid)
            else:
                logging.info('Result: %s', user_data)
                if user_data['email_address'] == user.email:
                    logging.info('Updating user %s email to mailchimp, returned OK', user.fb_uid)
                else:
                    mr.increment('mailchimp-update-email-error-response')
                    logging.error('Updating user %s email to mailchimp, returned %s', user.fb_uid, user_data)
        # Mark our current mailchimp_email down, so we can update it properly later if desired.
        user.mailchimp_email = user.email

    # Now that Mailchimp knows about our new user email,
    # we can update/reference it using the normal add_members() below.
    creation_timestamp = user.creation_time.strftime('%Y-%m-%dT%H:%M:%S')
    member = {
        'email_address': user.email,
        # Mailchimp is the official store of 'are they subscribed', so let's not overwrite it here
        'status_if_new': 'subscribed',
        'language': trimmed_locale,
        'merge_fields': {
            'USER_ID': user.fb_uid,  # necessary so we can update our local datastore on callbacks
            'FIRSTNAME': user.first_name or '',
            'LASTNAME': user.last_name or '',
            'FULLNAME': user.full_name or '',
            'NAME': user.first_name or user.full_name or '',
            'WEEKLY': unicode(user.send_email),
            'EXPIRED': unicode(user.expired_oauth_token),
            'LASTLOGIN': user.last_login_time.strftime('%Y-%m-%d') if user.last_login_time else '',
        },
        'timestamp_signup': creation_timestamp,
        'timestamp_opt': creation_timestamp,
    }

    if user.location:
        geocode = gmaps_api.lookup_address(user.location)
        if geocode:
            user_latlong = geocode.latlng()
            member['location'] = {
                'latitude': user_latlong[0],
                'longitude': user_latlong[1],
            }
        else:
            logging.warning('User %s (%s) had un-geocodable address: %s', user.fb_uid, user.full_name, user.location)

    mr.increment('mailchimp-api-call')
    result = mailchimp_api.add_members(mailchimp_list_id, [member])
    if result['errors']:
        mr.increment('mailchimp-error-response')
        logging.error('Writing user %s to mailchimp returned %s on input: %s', user.fb_uid, result['errors'], member)
    else:
        logging.info('Writing user %s to mailchimp returned OK', user.fb_uid)
def track_lookup(cls):
    """Bump the 'fb-lookups-comments' counter by mr.increment's default amount."""
    counter_name = 'fb-lookups-comments'
    mr.increment(counter_name)
def track_lookup(cls):
    """Bump the 'fb-lookups-user-events' counter, passing 3 (presumably the delta)."""
    counter_name = 'fb-lookups-user-events'
    mr.increment(counter_name, 3)
def track_lookup(cls):
    """Bump the 'fb-lookups-user' counter by mr.increment's default amount."""
    counter_name = 'fb-lookups-user'
    mr.increment(counter_name)
def track_lookup(cls):
    """Bump the 'fb-lookups-profile' counter by mr.increment's default amount."""
    counter_name = 'fb-lookups-profile'
    mr.increment(counter_name)
def classify_events(fbl, pe_list, fb_list):
    """Run the auto-classifiers over potential events and act on clear verdicts.

    ``pe_list`` and ``fb_list`` are processed pairwise.  Events accepted by the
    auto-add classifier are added via ``add_entities.add_update_event``; events
    rejected by the auto-not-add classifier are only marked as looked at.

    Returns a list of report lines: '+'-prefixed, tab-separated rows (including
    location details) for added events and '-'-prefixed rows for rejected ones.
    """
    report = []
    for pe, raw_event in zip(pe_list, fb_list):
        # Treat an 'empty' fetch result the same as no result at all.
        if raw_event and raw_event['empty']:
            fb_event = None
        else:
            fb_event = raw_event

        # Save past-event status up front; the next mapreduce run should then
        # not need to process this potential event.
        if pe.set_past_event(fb_event):
            pe.put()
        if not fb_event:
            continue

        # Already-handled or not-to-be-inspected events are skipped.  The
        # pre-filtering mapreduce does not send them, but scraping a user's
        # potential events throws everything in here.
        if not (pe.should_look_at and not pe.looked_at):
            continue

        event_id = pe.fb_event_id
        event_name = fb_event['info'].get('name', '')

        classified = event_classifier.classified_event_from_fb_event(fb_event)
        classified.classify()
        add_verdict = event_auto_classifier.is_auto_add_event(classified)

        if add_verdict[0]:
            logging.info("Found event %s, looking up location", pe.fb_event_id)
            loc = event_locations.LocationInfo(fb_event)
            fields = (
                pe.fb_event_id,
                loc.exact_from_event,
                loc.final_city,
                loc.final_city is not None,
                loc.fb_address,
                fb_event['info'].get('name', ''),
            )
            added_line = '+%s\n' % '\t'.join([unicode(field) for field in fields])
            try:
                add_entities.add_update_event(
                    fb_event,
                    fbl,
                    visible_to_fb_uids=pe.get_invite_uids(),
                    creating_method=eventdata.CM_AUTO,
                )
                saved_pe = potential_events.PotentialEvent.get_by_key_name(pe.fb_event_id)
                saved_pe.looked_at = True
                saved_pe.auto_looked_at = True
                saved_pe.put()
                # TODO(lambert): handle un-add-able events differently
                report.append(added_line)
                mr.increment('auto-added-dance-events')
            except fb_api.NoFetchedDataException as err:
                logging.error("Error adding event %s, no fetched data: %s", pe.fb_event_id, err)
            except add_entities.AddEventException as err:
                # NOTE(review): the message says "no fetched data" although this
                # handler catches AddEventException -- looks copy-pasted.
                logging.warning("Error adding event %s, no fetched data: %s", pe.fb_event_id, err)

        # The not-add classifier always runs and is told the add verdict.
        reject_verdict = event_auto_classifier.is_auto_notadd_event(classified, auto_add_result=add_verdict)
        if reject_verdict[0]:
            saved_pe = potential_events.PotentialEvent.get_by_key_name(pe.fb_event_id)
            saved_pe.looked_at = True
            saved_pe.auto_looked_at = True
            saved_pe.put()
            rejected_fields = (pe.fb_event_id, fb_event['info'].get('name', ''))
            report.append('-%s\n' % '\t'.join([unicode(field) for field in rejected_fields]))
            mr.increment('auto-notadded-dance-events')
    return report
def track_lookup(cls):
    """Bump the 'fb-lookups-event-rsvp' counter, passing 4 (presumably the delta)."""
    counter_name = 'fb-lookups-event-rsvp'
    mr.increment(counter_name, 4)
def track_lookup(cls):
    """Bump the 'fb-lookups-search-events' counter by mr.increment's default amount."""
    counter_name = 'fb-lookups-search-events'
    mr.increment(counter_name)