Example #1
    def rebuild_from_query(cls, force=False):
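        """Rebuild the search index from the datastore.

        Fetches up to MAX_OBJECTS object keys, deletes index docs that no longer
        have a backing object (aborting entirely if that would delete more than
        delete_threshold of the current object count, unless force=True), then
        re-indexes every object in batches via deferred calls to _save_ids.
        """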
        logging.info("Loading Index")
        if cls._is_ndb():
            db_query = cls.obj_type.query(
                *cls._get_query_params_for_indexing())
        else:
            db_query = cls.obj_type.all()
        object_keys = db_query.fetch(MAX_OBJECTS, keys_only=True)
        object_ids = set(cls._get_id(x) for x in object_keys)

        logging.info("Loaded %s objects for indexing", len(object_ids))
        if len(object_ids) >= MAX_OBJECTS:
            logging.critical(
                'Found %s objects. Increase the MAX_OBJECTS limit to search more events.',
                MAX_OBJECTS)

        doc_index = cls.real_index()

        docs_per_group = search.MAXIMUM_DOCUMENTS_PER_PUT_REQUEST

        logging.info("Deleting Expired docs")
        start_id = '0'
        doc_ids_to_delete = set()
        while True:
            doc_ids = [
                x.doc_id
                for x in doc_index.get_range(ids_only=True,
                                             start_id=start_id,
                                             include_start_object=False)
            ]
            if not doc_ids:
                break
            new_ids_to_delete = set(doc_ids).difference(object_ids)
            doc_ids_to_delete.update(new_ids_to_delete)
            logging.info(
                "Looking at %s doc_id candidates for deletion, will delete %s entries.",
                len(doc_ids), len(new_ids_to_delete))
            start_id = doc_ids[-1]
        if not force and doc_ids_to_delete and len(doc_ids_to_delete) > len(object_ids) * cls.delete_threshold:
            logging.critical(
                "Refusing to delete %s docs, more than %d%% of the total %s docs",
                len(doc_ids_to_delete), cls.delete_threshold * 100,
                len(object_ids))
            return
        logging.info("Deleting %s docs", len(doc_ids_to_delete))
        cls.delete_ids(list(doc_ids_to_delete))

        # Add all events
        logging.info("Loading %s docs, in groups of %s", len(object_ids),
                     docs_per_group)
        object_ids_list = sorted(object_ids)
        for x in object_ids_list:
            logging.info('index: %s', x)
        for i in range(0, len(object_ids_list), docs_per_group):
            group_object_ids = object_ids_list[i:i + docs_per_group]
            deferred.defer(cls._save_ids, group_object_ids)
Example #2
    def post(self):
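        """Admin handler to add, update, or delete a DBEvent from an FB event.

        Deletion happens inline; adds/updates run via a deferred task when
        'background' is set, otherwise inline with errors reported back.
        """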
        event_id = self.request.get('event_id')
        remapped_address = self.request.get('remapped_address')
        override_address = self.request.get('override_address')

        if self.request.get('delete'):
            e = eventdata.DBEvent.get_by_id(event_id)
            # This e will be None if the user submits a deletion-form twice
            if e:
                event_updates.delete_event(e)
            self.user.add_message("Event deleted!")
            return self.redirect('/events/admin_edit?event_id=%s' % event_id)

        # We could be looking at a potential event for something that is inaccessible to our admin.
        # So we want to grab the cached value here if possible, which should exist given the admin-edit flow.
        fb_event = get_fb_event(self.fbl, event_id)
        logging.info("Fetched fb_event %s", fb_event)
        if not fb_events.is_public_ish(fb_event):
            self.add_error('Cannot add secret/closed events to dancedeets!')
        self.errors_are_fatal()

        if self.request.get('background'):
            deferred.defer(
                add_entities.add_update_event,
                fb_event,
                self.fbl,
                creating_uid=self.user.fb_uid,
                remapped_address=remapped_address,
                override_address=override_address,
                creating_method=eventdata.CM_ADMIN
            )
            self.response.out.write("<title>Added!</title>Added!")
        else:
            try:
                add_entities.add_update_event(
                    fb_event,
                    self.fbl,
                    creating_uid=self.user.fb_uid,
                    remapped_address=remapped_address,
                    override_address=override_address,
                    creating_method=eventdata.CM_ADMIN
                )
            except Exception as e:
                logging.exception('Error adding event')
                self.add_error(str(e))
            self.errors_are_fatal()
            self.user.add_message("Changes saved!")
            return self.redirect('/events/admin_edit?event_id=%s' % event_id)
Example #3
def process_event_source_ids(discovered_list, fbl):
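    """Run the event pipeline over newly-discovered events.

    The disabled block below would also create Source entities for any
    extra_source_ids we haven't seen before and defer a scrape of them.
    """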
    # TODO(lambert): maybe trim any ids from posts with dates "past" the last time we scraped? tricky to get correct though
    logging.info("Loading processing %s discovered events", len(discovered_list))
    event_pipeline.process_discovered_events(fbl, discovered_list)

    # TODO: Should only run this code on events that we actually decide are worth adding
    if False:
        potential_new_source_ids = set([x.extra_source_id for x in discovered_list if x.extra_source_id])
        existing_source_ids = set([x.graph_id for x in thing_db.Source.get_by_key_name(potential_new_source_ids) if x])
        new_source_ids = set([x for x in potential_new_source_ids if x not in existing_source_ids])
        for source_id in new_source_ids:
            #TODO(lambert): we know it doesn't exist, why does create_source_from_id check datastore?
            s = thing_db.Source(key_name=source_id)
            s.put()
        logging.info("Found %s new sources", len(new_source_ids))

        # initiate an out-of-band-scrape for our new sources we found
        if new_source_ids:
            deferred.defer(scrape_events_from_source_ids, fbl, new_source_ids)
Example #4
def function_migrate_thing_to_new_id(fbapi_obj, old_source_id, new_source_id):
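    """Migrate a Source to its new FB page id after a page rename/merge.

    Follows further redirects via deferred tasks, merges metadata and counters
    from the old Source into the new one, repoints potential events, then
    deletes the old Source.
    """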
    old_source = thing_db.Source.get_by_key_name(old_source_id)

    # Maybe we got two of these and it already ran in parallel, so ignore this one
    if not old_source:
        return

    fbl = fb_api.FBLookup(None, fbapi_obj.access_token_list)

    fbl.fb.raise_on_page_redirect = True
    try:
        results = fbl.get(fb_api.LookupThingCommon, new_source_id)
    except fb_api.PageRedirectException as e:
        # If our forwarding address in turn has its own forwarding address,
        # repoint the old thing further down the chain
        deferred.defer(function_migrate_thing_to_new_id, fbl.fb, old_source_id,
                       e.to_id)
        return

    new_source = thing_db.create_source_from_id(fbl, new_source_id)
    new_source.creating_fb_uid = new_source.creating_fb_uid or old_source.creating_fb_uid
    new_source.creation_time = new_source.creation_time or old_source.creation_time
    new_source.last_scrape_time = new_source.last_scrape_time or old_source.last_scrape_time

    new_source.num_all_events = (new_source.num_all_events or 0) + (old_source.num_all_events or 0)
    new_source.num_potential_events = (new_source.num_potential_events or 0) + (old_source.num_potential_events or 0)
    new_source.num_real_events = (new_source.num_real_events or 0) + (old_source.num_real_events or 0)
    new_source.num_false_negatives = (new_source.num_false_negatives or 0) + (old_source.num_false_negatives or 0)

    # Who has pointers to sources??
    migrate_potential_events(old_source_id, new_source_id)

    new_source.put()
    old_source.delete()
Example #5
    def post(self):
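        """Receive scraped web events from scrapinghub and save them.

        Requires a matching scrapinghub_key (else 403), upserts a DBEvent per
        item, defers twitter/facebook publishing for events we haven't seen
        before, and finalizes the studio's upload.
        """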
        if self.json_body['scrapinghub_key'] != keys.get('scrapinghub_key'):
            self.response.status = 403
            return
        events_to_update = []
        new_ids = set()
        for json_body in self.json_body['items']:
            event_id = eventdata.DBEvent.generate_id(
                json_body['namespace'], json_body['namespaced_id'])
            e = eventdata.DBEvent.get_or_insert(event_id)
            if e.creating_method is None:
                new_ids.add(event_id)
            e.creating_method = eventdata.CM_WEB_SCRAPE
            events_to_update.append((e, json_body))

        event_updates.update_and_save_web_events(events_to_update)
        for event_id in new_ids:
            logging.info("New event, publishing to twitter/facebook: %s",
                         event_id)
            deferred.defer(pubsub.eventually_publish_event, event_id)

        process_upload_finalization(self.json_body['studio_name'])
        self.response.status = 200
Example #6
def migrate_potential_events(old_source_id, new_source_id):
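    """Repoint PotentialEvents from old_source_id to new_source_id, 100 at a time.

    Re-defers itself until no matching PotentialEvents remain.
    """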
    #STR_ID_MIGRATE
    potential_event_list = potential_events.PotentialEvent.gql(
        "WHERE source_ids = %s" % long(old_source_id)).fetch(100)

    for pe in potential_event_list:
        logging.info("old pe %s has ids: %s", pe.fb_event_id,
                     [x.id for x in pe.sources()])
        source_infos = set()
        for source in pe.sources():
            # remap ids
            if source.id == old_source_id:
                #STR_ID_MIGRATE
                source = source.copy()
                source.id = new_source_id
            source_infos.add(source)
        pe.set_sources(source_infos)
        logging.info("new pe %s has ids: %s", pe.fb_event_id,
                     [x.id for x in pe.sources()])
        pe.put()

    if len(potential_event_list):
        # Tail recursion via task queues!
        deferred.defer(migrate_potential_events, old_source_id, new_source_id)
Example #7
    def setup_login_state(self, request):
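        """Resolve the logged-in user and FB access token for this request.

        Trusts the FB signed-request cookie first, falls back to our own
        validated login cookie, refreshes long-lived tokens that are expiring,
        and lazily creates a User when a trusted FB cookie arrives for an
        account we haven't seen before.
        """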
        #TODO(lambert): change fb api to not request access token, and instead pull it from the user
        # only request the access token from FB when it's been longer than a day, and do it out-of-band to fetch-and-update-db-and-memcache

        self.fb_uid = None
        self.user = None
        self.access_token = None

        if len(request.get_all('nt')) > 1:
            logging.error('Have too many nt= parameters, something is Very Wrong!')
            for k, v in request.cookies.iteritems():
                logging.info("DEBUG: cookie %r = %r", k, v)
        # Load Facebook cookie
        try:
            response = facebook.parse_signed_request_cookie(request.cookies)
        except Cookie.CookieError:
            logging.exception("Error processing cookie: %s")
            return
        fb_cookie_uid = None
        if response:
            fb_cookie_uid = response['user_id']
        logging.info("fb cookie id is %s", fb_cookie_uid)

        # Normally, our trusted source of login id is the FB cookie,
        # though we may override it below in the case of access_token_md5
        trusted_cookie_uid = fb_cookie_uid

        # for k, v in self.request.cookies.iteritems():
        #     logging.info('cookie %s = %s', k, v)

        # Load our dancedeets logged-in user/state
        our_cookie_uid = None
        user_login_string = self.get_login_cookie()
        if user_login_string:
            user_login_cookie = json.loads(urllib.unquote(user_login_string))
            logging.info("Got login cookie: %s", user_login_cookie)
            if validate_hashed_userlogin(user_login_cookie):
                our_cookie_uid = user_login_cookie['uid']
                # If we have a browser cookie that's verified via access_token_md5,
                # let's trust it as authoritative here and ignore the fb cookie
                if not trusted_cookie_uid and user_login_cookie.get('access_token_md5'):
                    trusted_cookie_uid = our_cookie_uid
                    logging.info("Validated cookie, logging in as %s", our_cookie_uid)

        if self.request.cookies.get('user_login', ''):
            logging.info("Deleting old-style user_login cookie")
            self.response.set_cookie('user_login', '', max_age=0, path='/', domain=self._get_login_cookie_domain())

        # If the user has changed facebook users, let's automatically re-login at dancedeets
        if trusted_cookie_uid and trusted_cookie_uid != our_cookie_uid:
            self.set_login_cookie(trusted_cookie_uid)
            our_cookie_uid = trusted_cookie_uid

        # Don't force-logout the user if there is an our_cookie_uid but no trusted_cookie_uid
        # The fb cookie probably expired after a couple hours, and we'd prefer to keep our users logged-in

        # Logged-out view, just return without setting anything up
        if not our_cookie_uid:
            return

        self.fb_uid = our_cookie_uid
        self.user = users.User.get_by_id(self.fb_uid)

        # If we have a user, grab the access token
        if self.user:
            if trusted_cookie_uid:
                # Long-lived tokens should last "around" 60 days, so let's refresh-renew if there's only 40 days left
                if self.user.fb_access_token_expires:
                    token_expires_soon = (self.user.fb_access_token_expires - datetime.datetime.now()) < datetime.timedelta(days=40)
                else:
                    # These are either infinite-access tokens (which won't expire soon)
                    # or they are ancient tokens (in which case, our User reload mapreduce has already set user.expired_oauth_token)
                    token_expires_soon = False
                # Update the access token if necessary
                if self.user.expired_oauth_token or token_expires_soon or self.request.get('update_fb_access_token'):
                    try:
                        access_token, access_token_expires = self.get_long_lived_token_and_expires(request)
                    except TypeError:
                        logging.info("Could not access cookie ")
                    except facebook.AlreadyHasLongLivedToken:
                        logging.info("Already have long-lived token, FB wouldn't give us a new one, so no need to refresh anything.")
                    else:
                        logging.info("New access token from cookie: %s, expires %s", access_token, access_token_expires)
                        if access_token:
                            self.user = users.User.get_by_id(self.fb_uid)
                            self.user.fb_access_token = access_token
                            self.user.fb_access_token_expires = access_token_expires
                            self.user.expired_oauth_token = False
                            self.user.expired_oauth_token_reason = None
                            # this also sets to memcache
                            self.user.put()
                            logging.info("Stored the new access_token to the User db")
                        else:
                            logging.error("Got a cookie, but no access_token. Using the one from the existing user. Strange!")
                if 'web' not in self.user.clients:
                    self.user = users.User.get_by_id(self.fb_uid)
                    self.user.clients.append('web')
                    self.user.put()
                    logging.info("Added the web client to the User db")
                self.access_token = self.user.fb_access_token
            else:
                self.access_token = self.user.fb_access_token
                logging.info("Have dd login cookie but no fb login cookie")
                if self.user.expired_oauth_token:
                    self.fb_uid = None
                    self.user = None
                    self.access_token = None
                    return
        elif trusted_cookie_uid:
            # if we don't have a user but do have a token, the user has granted us permissions, so let's construct the user now
            try:
                access_token, access_token_expires = self.get_long_lived_token_and_expires(request)
            except facebook.AlreadyHasLongLivedToken:
                logging.warning(
                    "Don't have user, just trusted_cookie_uid. And unable to get long lived token for the incoming request. Giving up and doing logged-out"
                )
                self.fb_uid = None
                self.access_token = None
                self.user = None
                return
            self.access_token = access_token
            # Fix this ugly import hack:
            fbl = fb_api.FBLookup(self.fb_uid, self.access_token)
            fbl.debug = 'fbl' in self.debug_list
            fb_user = fbl.get(fb_api.LookupUser, self.fb_uid)

            referer = self.get_cookie('User-Referer')
            city = self.request.get('city') or self.get_location_from_headers() or get_location(fb_user)
            logging.info("User passed in a city of %r, facebook city is %s", self.request.get('city'), get_location(fb_user))
            ip = ips.get_remote_ip(self.request)
            user_creation.create_user_with_fbuser(
                self.fb_uid, fb_user, self.access_token, access_token_expires, city, ip, send_email=True, referer=referer, client='web'
            )
            # TODO(lambert): handle this MUUUCH better
            logging.info("Not a /login request and there is no user object, constructed one realllly-quick, and continuing on.")
            self.user = users.User.get_by_id(self.fb_uid)
            # Should not happen:
            if not self.user:
                logging.error("We still don't have a user!")
                self.fb_uid = None
                self.access_token = None
                self.user = None
                return
        else:
            # no user, no trusted_cookie_uid, but we have fb_uid from the user_login cookie
            logging.error("We have a user_login cookie, but no user, and no trusted_cookie_uid. Acting as logged-out")
            self.fb_uid = None
            self.access_token = None
            self.user = None
            return

        logging.info("Logged in uid %s with name %s and token %s", self.fb_uid, self.user.full_name, self.access_token)

        # Track last-logged-in state
        hour_ago = datetime.datetime.now() - datetime.timedelta(hours=1)
        if not getattr(self.user, 'last_login_time', None) or self.user.last_login_time < hour_ago:
            # Do this in a separate request so we don't increase latency on this call
            deferred.defer(update_last_login_time, self.user.fb_uid, datetime.datetime.now(), _queue='slow-queue')
            backgrounder.load_users([self.fb_uid], allow_cache=False)
Example #8
    def _fetch_object_keys(self, object_keys_to_lookup):
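        """Fetch the given object keys from the FB Graph API via batch requests.

        Issues one batch RPC per object key, then parses each part's response,
        classifying objects as empty (deleted or permission-denied), migrated
        (page-id redirects), or bad (errors/timeouts on non-optional parts).
        Returns a dict of object_key -> fetched data for the successful ones.
        """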
        logging.info("BatchLookup: Fetching IDs from FB: %s", object_keys_to_lookup)
        # initiate RPCs
        object_keys_to_rpcs = {}
        for object_key in object_keys_to_lookup:
            cls, oid = break_key(object_key)
            cls.track_lookup()
            parts_to_urls = cls.get_lookups(oid)
            batch_list = [
                dict(method='GET', name=part_key, relative_url=url, omit_response_on_success=False) for (part_key, url) in parts_to_urls
            ]
            rpc, token = self._create_rpc_for_batch(batch_list, cls.use_access_token)
            object_keys_to_rpcs[object_key] = rpc, token

        # fetch RPCs
        fetched_objects = {}
        for object_key, (object_rpc, object_token) in object_keys_to_rpcs.iteritems():
            cls, oid = break_key(object_key)
            parts_to_urls = cls.get_lookups(oid)
            mini_batch_list = [dict(name=part_key, relative_url=url) for (part_key, url) in parts_to_urls]
            this_object = {}
            this_object['empty'] = None
            object_is_bad = False
            rpc_results = self._map_rpc_to_data(object_rpc)
            if isinstance(rpc_results, list):
                named_results = zip(mini_batch_list, rpc_results)
            elif rpc_results is None:
                logging.warning("BatchLookup: Has empty rpc_results, perhaps due to URL fetch timeout")
                object_is_bad = True
                named_results = []
            else:
                error_code = rpc_results.get('error', {}).get('code')
                error_type = rpc_results.get('error', {}).get('type')
                error_message = rpc_results.get('error', {}).get('message')
                # expired/invalidated OAuth token for User objects. We use one OAuth token per BatchLookup, so no use continuing...
                # we don't trigger on UserEvents objects since those are often optional and we don't want to break on those, or set invalid bits on those (get it from the User failures instead)
                if error_code == 190 and error_type == 'OAuthException':
                    logging.warning("Error with expired token: %s", object_token)
                    raise ExpiredOAuthToken(error_message)
                logging.error("BatchLookup: Error occurred on response, rpc_results is %s", rpc_results)
                object_is_bad = True
                named_results = []
            for batch_item, result in named_results:
                object_rpc_name = batch_item['name']
                if result is None:
                    logging.warning("BatchLookup: Got timeout when requesting %s", batch_item)
                    if object_rpc_name not in cls.optional_keys:
                        object_is_bad = True
                    continue
                object_result_code = result['code']
                try:
                    object_json = json.loads(result['body'])
                except:
                    logging.error('Error parsing result body for %r: %r', batch_item, result)
                    raise
                if object_result_code in [200, 400] and object_json is not None:
                    error_code = None
                    if type(object_json) == dict and ('error_code' in object_json or 'error' in object_json):
                        error_code = object_json.get('error_code', object_json.get('error', {}).get('code', None))
                    if error_code == 100:
                        # This means the event exists, but the current access_token is insufficient to query it
                        this_object['empty'] = EMPTY_CAUSE_INSUFFICIENT_PERMISSIONS
                    elif error_code == 21:
                        message = object_json['error']['message']
                        # Facebook gave us a huge hack when they decided to rename/merge page ids,
                        # and so we are forced to deal with remapping by parsing strings at this lowest level.
                        # "Page ID 289919164441106 was migrated to page ID 175608368718.  Please update your API calls to the new ID"
                        # But only do it once per object, so rely on object_is_bad to tell us whether we've been through this before
                        if not object_is_bad and re.search(r'Page ID \d+ was migrated to page ID \d+\.', message):
                            from_id, to_id = re.findall(r'ID (\d+)', message)
                            if self.raise_on_page_redirect:
                                raise PageRedirectException(from_id, to_id)
                            else:
                                from event_scraper import thing_db_fixer
                                from util import deferred
                                logging.warning(message)
                                logging.warning("Executing deferred call to migrate to new ID, returning None here.")
                                deferred.defer(thing_db_fixer.function_migrate_thing_to_new_id, self, from_id, to_id)
                        object_is_bad = True
                    elif error_code in [
                        2,  # Temporary API error: An unexpected error has occurred. Please retry your request later.
                        2500,  # Dependent-lookup on non-existing field: Cannot specify an empty identifier.
                    ]:
                        # Handle errors as documented here: https://developers.facebook.com/docs/graph-api/using-graph-api/v2.0#errors
                        logging.warning("BatchLookup: Error code from FB server for %s: %s: %s", object_rpc_name, error_code, object_json)
                        if object_rpc_name not in cls.optional_keys:
                            object_is_bad = True
                    elif error_code:
                        logging.error("BatchLookup: Error code from FB server for %s: %s: %s", object_rpc_name, error_code, object_json)
                        if object_rpc_name not in cls.optional_keys:
                            object_is_bad = True
                    elif object_json == False:
                        this_object['empty'] = EMPTY_CAUSE_DELETED
                    else:
                        this_object[object_rpc_name] = object_json
                else:
                    logging.warning("BatchLookup: Got code %s when requesting %s: %s", object_result_code, batch_item, result)
                    if object_rpc_name not in cls.optional_keys:
                        object_is_bad = True
            if object_is_bad:
                logging.warning("BatchLookup: Failed to complete object: %s, only have keys %s", object_key, this_object.keys())
            else:
                fetched_objects[object_key] = this_object
        return fetched_objects
Example #9
def yield_load_fb_event(fbl, all_events):
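    """Mapreduce handler: refresh a batch of DBEvents from their sources.

    Web events are force-updated as needed; FB events are batch-fetched, and
    any that come back empty due to insufficient permissions (but should be
    visible to some user) are retried with backup tokens via a deferred task.
    """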
    ctx = context.get()
    if ctx:
        params = ctx.mapreduce_spec.mapper.params
        disable_updates = params['disable_updates']
        only_if_updated = params['only_if_updated']
    else:
        disable_updates = []
        only_if_updated = True

    # Process web_events
    web_events = [x for x in all_events if not x.is_fb_event]
    events_to_update = []
    for web_event in web_events:
        if event_updates.need_forced_update(web_event):
            events_to_update.append((web_event, web_event.web_event))
    event_updates.update_and_save_web_events(events_to_update, disable_updates=disable_updates)

    # Now process fb_events
    db_events = [x for x in all_events if x.is_fb_event]
    logging.info("loading db events %s", [db_event.fb_event_id for db_event in db_events])

    fbl.request_multi(fb_api.LookupEvent, [x.fb_event_id for x in db_events])
    fbl.request_multi(fb_api.LookupEventAttending, [x.fb_event_id for x in db_events])
    # We load these too, just in case we want to check up on our auto-attendee criteria for events
    fbl.request_multi(fb_api.LookupEventAttendingMaybe, [x.fb_event_id for x in db_events])

    # fbl.request_multi(fb_api.LookupEventPageComments, [x.fb_event_id for x in db_events])
    fbl.batch_fetch()
    events_to_update = []
    empty_fb_event_ids = []
    for db_event in db_events:
        try:
            real_fb_event = fbl.fetched_data(fb_api.LookupEvent, db_event.fb_event_id)
            # If it's an empty fb_event with our main access token, and we have other tokens we'd like to try...
            # If there are no visible_to_fb_uids and we don't have permissions, then we don't do this...
            #
            # TODO: This would happen on event deletion?
            #
            # TODO: Also, who sets visible_to_fb_uids? Why didn't this event have any?
            # TODO: Who re-sets visible_to_fb_uids after it goes empty? Can we ensure that keeps going?
            #
            # TODO: And what happens if we have a deleted event, with visible_to_fb_uids, that we attempt to run and query, and nothing happens?
            # Should we distinguish between deleted (and inaccessible) and permissions-lost-to-token (and inaccessible)?
            #
            # TODO: Why doesn't this update the event? Because add_event_tuple_if_updating seems to do nothing, probably because no fb_event is returned
            if real_fb_event['empty'] == fb_api.EMPTY_CAUSE_INSUFFICIENT_PERMISSIONS and db_event.visible_to_fb_uids:
                empty_fb_event_ids.append(db_event.fb_event_id)
            else:
                # Otherwise if it's visible to our main token, or there are no other tokens to try, deal with it here.
                add_event_tuple_if_updating(events_to_update, fbl, db_event, only_if_updated)
        except fb_api.NoFetchedDataException as e:
            logging.info("No data fetched for event id %s: %s", db_event.fb_event_id, e)
    # Now trigger off a background reloading of empty fb_events
    if empty_fb_event_ids:
        logging.info("Couldn't fetch, using backup tokens for events: %s", empty_fb_event_ids)
        deferred.defer(
            load_fb_events_using_backup_tokens,
            empty_fb_event_ids,
            allow_cache=fbl.allow_cache,
            only_if_updated=only_if_updated,
            disable_updates=disable_updates
        )
    logging.info("Updating events: %s", [x[0].id for x in events_to_update])
    # And then re-save all the events in here
    if events_to_update:
        event_updates.update_and_save_fb_events(events_to_update, disable_updates=disable_updates)