# TODO: Why doesn't this update the event? Because add_event_tuple_if_updating seems to do nothing, probably because no fb_event is returned
            if real_fb_event['empty'] == fb_api.EMPTY_CAUSE_INSUFFICIENT_PERMISSIONS and db_event.visible_to_fb_uids:
                empty_fb_event_ids.append(db_event.fb_event_id)
            else:
                # Otherwise if it's visible to our main token, or there are no other tokens to try, deal with it here.
                add_event_tuple_if_updating(events_to_update, fbl, db_event, only_if_updated)
        except fb_api.NoFetchedDataException as e:
            logging.info("No data fetched for event id %s: %s", db_event.fb_event_id, e)
    # Now trigger off a background reloading of empty fb_events
    if empty_fb_event_ids:
        logging.info("Couldn't fetch, using backup tokens for events: %s", empty_fb_event_ids)
        deferred.defer(load_fb_events_using_backup_tokens, empty_fb_event_ids, allow_cache=fbl.allow_cache, only_if_updated=only_if_updated, update_geodata=update_geodata)
    logging.info("Updating events: %s", [x[0].id for x in events_to_update])
    # And then re-save all the events in here
    event_updates.update_and_save_fb_events(events_to_update, update_geodata=update_geodata)
map_load_fb_event = fb_mapreduce.mr_wrap(yield_load_fb_event)
load_fb_event = fb_mapreduce.nomr_wrap(yield_load_fb_event)
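A minimal sketch of what fb_mapreduce.mr_wrap and nomr_wrap plausibly do, inferred only from how every example here uses them; the real implementations are not shown in these snippets, and fbl_from_mapper_params is a hypothetical helper:

def mr_wrap(func):
    # Adapt a (fbl, entities) function so the mapreduce framework can call
    # it with just a batch of entities, rebuilding the FBLookup from the
    # mapper params.
    def mr_func(entities):
        fbl = fbl_from_mapper_params()  # hypothetical, not in the source
        return func(fbl, entities)
    return mr_func

def nomr_wrap(func):
    # Pass-through wrapper for calling the same function directly, outside
    # of any mapreduce.
    def nomr_func(fbl, entities):
        return func(fbl, entities)
    return nomr_func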


def yield_load_fb_event_attending(fbl, all_events):
    db_events = [x for x in all_events if x.is_fb_event]
    fbl.get_multi(fb_api.LookupEventAttending, [x.fb_event_id for x in db_events], allow_fail=True)
map_load_fb_event_attending = fb_mapreduce.mr_wrap(yield_load_fb_event_attending)
load_fb_event_attending = fb_mapreduce.nomr_wrap(yield_load_fb_event_attending)


def mr_load_fb_events(fbl, load_attending=False, time_period=None, update_geodata=True, only_if_updated=True, queue='slow-queue'):
    if load_attending:
        event_or_attending = 'Event Attendings'
        mr_func = 'map_load_fb_event_attending'
    else:
        # Assumed from the parallel if-branch above; the original snippet is
        # truncated at this point:
        event_or_attending = 'Events'
        mr_func = 'map_load_fb_event'
# Example 2
        try:
            discovered_list.update(_process_thing_feed(fbl, source))
        except fb_api.NoFetchedDataException as e:
            logging.warning("Failed to fetch data for thing: %s", str(e))
    logging.info("Discovered %s items: %s", len(discovered_list), discovered_list)
    return discovered_list


def scrape_events_from_source_ids(fbl, source_ids):
    sources = thing_db.Source.get_by_key_name(source_ids)
    sources = [x for x in sources if x]
    logging.info("Looking up %s source_ids, found %s sources", len(source_ids), len(sources))
    scrape_events_from_sources(fbl, sources)
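Hypothetical usage sketch; the ids are placeholders, and fbl is an already-configured FBLookup as used throughout these examples:

# scrape_events_from_source_ids(fbl, ['1234567890', '9876543210'])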


map_scrape_events_from_sources = fb_mapreduce.mr_wrap(scrape_events_from_sources)


def mapreduce_scrape_all_sources(fbl, min_potential_events=None, queue='slow-queue'):
    # Do not apply the min_potential_events>1 filter in the mapreduce filters,
    # or it will want to do a range-shard on that property. Instead, pass it
    # down and use it as an early-return in the per-Source processing.
    # TODO: maybe we do want a range-shard filter, to save on loading all the useless sources?
    fb_mapreduce.start_map(
        fbl,
        'Scrape All Sources',
        'event_scraper.thing_scraper.map_scrape_events_from_sources',
        'event_scraper.thing_db.Source',
        handle_batch_size=10,
        extra_mapper_params={'min_potential_events': min_potential_events},
        queue=queue,
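The early-return described in the comment above might look roughly like this, reusing the context.get() params pattern from Example 14. scrape_events_from_sources' real body is not shown in these snippets, and num_potential_events and _scrape_one_source are assumed names:

from mapreduce import context  # the same import the snippets rely on for context.get()

def scrape_events_from_sources(fbl, sources):
    ctx = context.get()
    params = ctx.mapreduce_spec.mapper.params if ctx else {}
    min_potential_events = params.get('min_potential_events') or 0
    for source in sources:
        # The early-return: skip low-signal Sources here rather than
        # range-sharding on the property in the mapreduce filters.
        if (source.num_potential_events or 0) < min_potential_events:
            continue
        _scrape_one_source(fbl, source)  # hypothetical helper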
# Example 3
        except fb_api.NoFetchedDataException as e:
            logging.info("No data fetched for event id %s: %s",
                         db_event.fb_event_id, e)
    # Now trigger off a background reloading of empty fb_events
    if empty_fb_event_ids:
        deferred.defer(load_fb_events_using_backup_tokens,
                       empty_fb_event_ids,
                       allow_cache=fbl.allow_cache,
                       only_if_updated=only_if_updated,
                       update_geodata=update_geodata)
    # And then re-save all the events in here
    event_updates.update_and_save_fb_events(events_to_update,
                                            update_geodata=update_geodata)


map_load_fb_event = fb_mapreduce.mr_wrap(yield_load_fb_event)
load_fb_event = fb_mapreduce.nomr_wrap(yield_load_fb_event)


def yield_load_fb_event_attending(fbl, all_events):
    db_events = [x for x in all_events if x.is_fb_event]
    fbl.get_multi(fb_api.LookupEventAttending,
                  [x.fb_event_id for x in db_events])


map_load_fb_event_attending = fb_mapreduce.mr_wrap(
    yield_load_fb_event_attending)
load_fb_event_attending = fb_mapreduce.nomr_wrap(yield_load_fb_event_attending)


def mr_load_fb_events(fbl,
# Example 4
    fbl.request_multi(fb_api.LookupEvent, [x.fb_event_id for x in pe_list])
    fbl.batch_fetch()
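    # request_multi queues the lookups; batch_fetch then issues them as one
    # batched Graph API round-trip (pattern inferred from its use here and in
    # export_sources below).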

    csv_file = StringIO.StringIO()
    csv_writer = csv.writer(csv_file)

    for pe in pe_list:
        try:
            result = json.dumps(fbl.fetched_data(fb_api.LookupEvent, pe.fb_event_id))
            cache_key = fbl.key_to_cache_key(fb_api.generate_key(fb_api.LookupEvent, pe.fb_event_id))
            csv_writer.writerow([cache_key, result])
        except fb_api.NoFetchedDataException:
            logging.error("skipping row for event id %s", pe.fb_event_id)
    yield csv_file.getvalue()

map_dump_fb_json = fb_mapreduce.mr_wrap(dump_fb_json)
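Each dumped row pairs a cache key with the event's JSON blob; a minimal sketch of reading such a dump back, assuming a local file path (the helper name is hypothetical):

import csv
import json

def read_fb_json_dump(path):
    # Yields (cache_key, fb_event_dict) pairs from a dump written above.
    with open(path) as f:
        for cache_key, result in csv.reader(f):
            yield cache_key, json.loads(result)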

def mr_dump_events(fbl):
    fb_mapreduce.start_map(
        fbl,
        'Dump Potential FB Event Data',
        'logic.mr_dump.map_dump_fb_json',
        'event_scraper.potential_events.PotentialEvent',
        handle_batch_size=80,
        queue=None,
        filters=[('looked_at', '=', None)],
        output_writer_spec='mapreduce.output_writers.GoogleCloudStorageOutputWriter',
        output_writer={
            'mime_type': 'text/plain',
            'bucket_name': 'dancedeets-hrd.appspot.com',
        },
# Example 5
        source.put()
        if new_source:
            backgrounder.load_sources([source_id], fb_uid=fbl.fb_uid)
        return source
    return None


def create_sources_from_event(fbl, db_event):
    logging.info('create_sources_from_event: %s', db_event.id)
    create_source_from_id(fbl, db_event.owner_fb_uid)
    for admin in db_event.admins:
        if admin['id'] != db_event.owner_fb_uid:
            create_source_from_id(fbl, admin['id'])


map_create_sources_from_event = fb_mapreduce.mr_wrap(create_sources_from_event)


def explode_per_source_count(pe):
    db_event = eventdata.DBEvent.get_by_id(pe.fb_event_id)

    is_potential_event = pe.match_score > 0
    real_event = db_event is not None
    false_negative = bool(db_event and not is_potential_event)
    result = (is_potential_event, real_event, false_negative)

    for source_id in pe.source_ids_only():
        yield (source_id, json.dumps(result))


def combine_source_count(source_id, counts_to_sum):
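    # Body truncated in the original listing. A minimal sketch of what it
    # plausibly does, given the (source_id, json-encoded triple) pairs that
    # explode_per_source_count emits above -- an assumption, not the original:
    num_potential = num_real = num_false_negative = 0
    for value in counts_to_sum:
        is_potential_event, real_event, false_negative = json.loads(value)
        num_potential += bool(is_potential_event)
        num_real += bool(real_event)
        num_false_negative += bool(false_negative)
    yield '%s: %s potential, %s real, %s false-negative\n' % (
        source_id, num_potential, num_real, num_false_negative)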
# Example 6
            pe2.auto_looked_at = True
            pe2.put()
            result = '-%s\n' % '\t'.join(unicode(x) for x in (pe.fb_event_id, fb_event['info'].get('name', '')))
            results.append(result)
            mr.increment('auto-notadded-dance-events')
    return results


def classify_events_with_yield(fbl, pe_list):
    assert fbl.allow_cache
    fb_list = fbl.get_multi(fb_api.LookupEvent, [x.fb_event_id for x in pe_list], allow_fail=True)
    # DISABLE_ATTENDING
    results = classify_events(fbl, pe_list, fb_list)
    yield ''.join(results).encode('utf-8')

map_classify_events = fb_mapreduce.mr_wrap(classify_events_with_yield)


def mr_classify_potential_events(fbl, past_event):
    filters = [('looked_at', '=', None), ('should_look_at', '=', True)]
    if past_event is not None:
        filters.append(('past_event', '=', past_event))
    fb_mapreduce.start_map(
        fbl,
        'Auto-Add Events',
        'event_scraper.auto_add.map_classify_events',
        'event_scraper.potential_events.PotentialEvent',
        filters=filters,
        handle_batch_size=20,
        queue='fast-queue',
        output_writer_spec='mapreduce.output_writers.GoogleCloudStorageOutputWriter',
# Example 7
    source.compute_derived_properties(fb_data)
    logging.info('Getting source for id %s: %s', source.graph_id, source.name)
    return source


def create_source_from_event(fbl, db_event):
    if not db_event.owner_fb_uid:
        return
    # technically we could check if the object exists in the db, before we bother fetching the feed
    thing_feed = fbl.get(fb_api.LookupThingFeed, db_event.owner_fb_uid)
    if not thing_feed['empty']:
        s = create_source_for_id(db_event.owner_fb_uid, thing_feed)
        s.put()


map_create_source_from_event = fb_mapreduce.mr_wrap(create_source_from_event)


def export_sources(fbl, sources):
    fbl.request_multi(fb_api.LookupThingFeed, [x.graph_id for x in sources])
    fbl.batch_fetch()
    for source in sources:
        try:
            thing_feed = fbl.fetched_data(fb_api.LookupThingFeed,
                                          source.graph_id)
            if 'info' not in thing_feed:
                continue
            name = thing_feed['info'].get('name', '').encode('utf8')
            desc = thing_feed['info'].get('description', '').encode('utf8')
            fields = (
                source.graph_id,
# Example 8
def create_source_for_id(source_id, fb_data):
    source = Source.get_by_key_name(source_id) or Source(key_name=source_id, street_dance_related=False)
    source.compute_derived_properties(fb_data)
    logging.info('Getting source for id %s: %s', source.graph_id, source.name)
    return source

def create_source_from_event(fbl, db_event):
    if not db_event.owner_fb_uid:
        return
    # technically we could check if the object exists in the db, before we bother fetching the feed
    thing_feed = fbl.get(fb_api.LookupThingFeed, db_event.owner_fb_uid)
    if not thing_feed['empty']:
        s = create_source_for_id(db_event.owner_fb_uid, thing_feed)
        s.put()
map_create_source_from_event = fb_mapreduce.mr_wrap(create_source_from_event)

def export_sources(fbl, sources):
    fbl.request_multi(fb_api.LookupThingFeed, [x.graph_id for x in sources])
    fbl.batch_fetch()
    for source in sources:
        try:
            thing_feed = fbl.fetched_data(fb_api.LookupThingFeed, source.graph_id)
            if 'info' not in thing_feed:
                continue
            name = thing_feed['info'].get('name', '').encode('utf8')
            desc = thing_feed['info'].get('description', '').encode('utf8')
            fields = (
                source.graph_id,
                source.graph_type,
                source.creation_time,
# Example 9
    for source in sources:
        try:
            thing_feed = fbl.fetched_data(fb_api.LookupThingFeed, source.graph_id)
            discovered_list.update(process_thing_feed(source, thing_feed))
        except fb_api.NoFetchedDataException as e:
            logging.warning("Failed to fetch data for thing: %s", str(e))
    logging.info("Discovered %s items: %s", len(discovered_list), discovered_list)
    return discovered_list

def scrape_events_from_source_ids(fbl, source_ids):
    sources = thing_db.Source.get_by_key_name(source_ids)
    sources = [x for x in sources if x]
    logging.info("Looking up %s source_ids, found %s sources", len(source_ids), len(sources))
    scrape_events_from_sources(fbl, sources)

map_scrape_events_from_sources = fb_mapreduce.mr_wrap(scrape_events_from_sources)

def mapreduce_scrape_all_sources(fbl, min_potential_events=None, queue='super-slow-queue'):
    # Do not apply the min_potential_events>1 filter in the mapreduce filters,
    # or it will want to do a range-shard on that property. Instead, pass it
    # down and use it as an early-return in the per-Source processing.
    # TODO: maybe we do want a range-shard filter, to save on loading all the useless sources?
    fb_mapreduce.start_map(
        fbl,
        'Scrape All Sources',
        'event_scraper.thing_scraper.map_scrape_events_from_sources',
        'event_scraper.thing_db.Source',
        handle_batch_size=10,
        extra_mapper_params={'min_potential_events': min_potential_events},
        queue=queue,
        randomize_tokens=True,
# Example 10
            if 'info' not in fbe:
                logging.error("skipping row2 for event id %s", e.fb_event_id)
                continue
            attendees = fb_api.get_all_members_count(fbe)
            if not fb_events.is_public(fbe) and fb_events.is_public_ish(fbe):
                mr.increment('nonpublic-and-large')
            privacy = fbe['info'].get('privacy', 'UNKNOWN')
            mr.increment('privacy-%s' % privacy)

            start_date = e.start_time.strftime('%Y-%m-%d') if e.start_time else ''
            yield '%s\n' % '\t'.join(str(x) for x in [e.fb_event_id, start_date, privacy, attendees])
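            # e.g. "123456789\t2016-05-21\tOPEN\t150" (illustrative values only)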
        except fb_api.NoFetchedDataException:
            logging.error("skipping row for event id %s", e.fb_event_id)


map_dump_private_events = fb_mapreduce.mr_wrap(count_private_events)


def mr_private_events(fbl):
    fb_mapreduce.start_map(
        fbl,
        'Dump Private Events',
        'servlets.tools.map_dump_private_events',
        'events.eventdata.DBEvent',
        handle_batch_size=80,
        queue=None,
        output_writer_spec='mapreduce.output_writers.GoogleCloudStorageOutputWriter',
        output_writer={
            'mime_type': 'text/plain',
            'bucket_name': 'dancedeets-hrd.appspot.com',
        },
# Example 11
            attendees = fb_api.get_all_members_count(fbe)
            privacy = fbe['info'].get('privacy', 'OPEN')
            if privacy != 'OPEN' and attendees > 60:
                mr.increment('nonpublic-and-large')
            mr.increment('privacy-%s' % privacy)

            start_date = e.start_time.strftime(
                '%Y-%m-%d') if e.start_time else ''
            yield '%s\n' % '\t'.join(
                str(x)
                for x in [e.fb_event_id, start_date, privacy, attendees])
        except fb_api.NoFetchedDataException:
            logging.error("skipping row for event id %s", e.fb_event_id)


map_dump_private_events = fb_mapreduce.mr_wrap(count_private_events)


def mr_private_events(fbl):
    fb_mapreduce.start_map(
        fbl,
        'Dump Private Events',
        'servlets.tools.map_dump_private_events',
        'events.eventdata.DBEvent',
        handle_batch_size=80,
        queue=None,
        output_writer_spec=
        'mapreduce.output_writers.GoogleCloudStorageOutputWriter',
        output_writer={
            'mime_type': 'text/plain',
            'bucket_name': 'dancedeets-hrd.appspot.com',
# Example 12
                fb_event = fbl.fetched_data(fb_api.LookupEvent, pe.fb_event_id)
                fb_event_attending = fbl.fetched_data(fb_api.LookupEventAttending, pe.fb_event_id)
            except fb_api.NoFetchedDataException:
                continue
            if fb_event['empty']:
                continue
            predict_service = predict_service or gprediction.get_predict_service()
            pe = potential_events.update_scores_for_potential_event(pe, fb_event, fb_event_attending, predict_service)
        logging.info("%s has ms=%s, d=%s, nd=%s", pe.fb_event_id, pe.match_score, pe.dance_bias_score, pe.non_dance_bias_score)
        if pe.dance_bias_score > 0.5 and pe.non_dance_bias_score > 0.5:
            result = '%s:%s:%s:%s\n' % (pe.fb_event_id, pe.match_score, pe.dance_bias_score, pe.non_dance_bias_score)
            results.append(result)
    yield ''.join(results).encode('utf-8')


map_classify_events = fb_mapreduce.mr_wrap(classify_events)

def mr_classify_potential_events(fbl):
    fb_mapreduce.start_map(
        fbl,
        'Auto-Classify Events',
        'ml.mr_prediction.map_classify_events',
        'event_scraper.potential_events.PotentialEvent',
        filters=[('looked_at', '=', None)],
        handle_batch_size=20,
        queue='slow-queue',
        output_writer_spec='mapreduce.output_writers.GoogleCloudStorageOutputWriter',
        output_writer={
            'mime_type': 'text/plain',
            'bucket_name': 'dancedeets-hrd.appspot.com',
        },
# Example 13
                else:
                    mr.increment('auto-added-dance-events-future')
            except fb_api.NoFetchedDataException as e:
                logging.error("Error adding event %s, no fetched data: %s", event_id, e)
            except add_entities.AddEventException as e:
                logging.warning("Error adding event %s, no fetched data: %s", event_id, e)
    return results


def classify_events_with_yield(fbl, pe_list):
    fb_list = fbl.get_multi(fb_api.LookupEvent, [x.fb_event_id for x in pe_list], allow_fail=True)
    results = classify_events(fbl, pe_list, fb_list)
    yield ''.join(results).encode('utf-8')


map_classify_events = fb_mapreduce.mr_wrap(classify_events_with_yield)


def mr_classify_potential_events(fbl, past_event, dancey_only):
    filters = []
    if dancey_only:
        filters.append(('should_look_at', '=', True))
    if past_event is not None:
        filters.append(('past_event', '=', past_event))
    fb_mapreduce.start_map(
        fbl,
        'Auto-Add Events',
        'event_scraper.auto_add.map_classify_events',
        'event_scraper.potential_events.PotentialEvent',
        filters=filters,
        handle_batch_size=20,
# Example 14
            db_event.visible_to_fb_uids = []
            db_event.put()
            # Let's update the DBEvent as necessary (note, this uses the last-updated FBLookup)
            # Unfortunately, we failed to get anything in our fbl, as it was raising an ExpiredOAuthToken
            # So instead, let's call it and just have it use the db_event.fb_event
            if fbl:
                add_event_tuple_if_updating(events_to_update, fbl, db_event, only_if_updated)
    if events_to_update:
        event_updates.update_and_save_fb_events(events_to_update, disable_updates=disable_updates)


def yield_resave_display_event(fbl, all_events):
    event_updates.resave_display_events(all_events)


map_resave_display_event = fb_mapreduce.mr_wrap(yield_resave_display_event)


def yield_load_fb_event(fbl, all_events):
    ctx = context.get()
    if ctx:
        params = ctx.mapreduce_spec.mapper.params
        disable_updates = params['disable_updates']
        only_if_updated = params['only_if_updated']
    else:
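        # Not running inside a mapreduce (e.g. invoked via the nomr_wrap
        # variant), so fall back to defaults.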
        disable_updates = []
        only_if_updated = True

    # Process web_events
    web_events = [x for x in all_events if not x.is_fb_event]
    events_to_update = []
# Example 15
        if not potential_event.looked_at:
            continue
        try:
            good_event = 'dance' if potential_event.fb_event_id in good_event_ids else 'nodance'

            fb_event = fbl.fetched_data(fb_api.LookupEvent, potential_event.fb_event_id)
            if fb_event['empty']:
                continue
            fb_event_attending = fbl.fetched_data(fb_api.LookupEventAttending, potential_event.fb_event_id)

            training_features = get_training_features(potential_event, fb_event, fb_event_attending)
            csv_writer.writerow([good_event] + list(training_features))
        except fb_api.NoFetchedDataException:
            logging.info("No data fetched for event id %s", potential_event.fb_event_id)
    yield csv_file.getvalue()
map_training_data_for_pevents = fb_mapreduce.mr_wrap(training_data_for_pevents)

def get_training_features(potential_event, fb_event, fb_event_attending):
    if 'owner' in fb_event['info']:
        owner_name = 'id%s' % fb_event['info']['owner']['id']
    else:
        owner_name = ''
    location = event_locations.get_address_for_fb_event(fb_event).encode('utf-8')
    def strip_text(s):
        return strip_punctuation(s.encode('utf8')).lower()
    name = strip_text(fb_event['info'].get('name', ''))
    description = strip_text(fb_event['info'].get('description', ''))

    attendee_list = ' '.join(['id%s' % x['id'] for x in fb_event_attending['attending']['data']])

    source_list = ' '.join('id%s' % x for x in potential_event.source_ids)
# Example 16
    csv_file = StringIO.StringIO()
    csv_writer = csv.writer(csv_file)

    for pe in pe_list:
        try:
            result = json.dumps(
                fbl.fetched_data(fb_api.LookupEvent, pe.fb_event_id))
            cache_key = fbl.key_to_cache_key(
                fb_api.generate_key(fb_api.LookupEvent, pe.fb_event_id))
            csv_writer.writerow([cache_key, result])
        except fb_api.NoFetchedDataException:
            logging.error("skipping row for event id %s", pe.fb_event_id)
    yield csv_file.getvalue()


map_dump_fb_json = fb_mapreduce.mr_wrap(dump_fb_json)


def mr_dump_events(fbl):
    fb_mapreduce.start_map(
        fbl,
        'Dump Potential FB Event Data',
        'logic.mr_dump.map_dump_fb_json',
        'event_scraper.potential_events.PotentialEvent',
        handle_batch_size=80,
        queue=None,
        filters=[('looked_at', '=', None)],
        output_writer_spec=
        'mapreduce.output_writers.GoogleCloudStorageOutputWriter',
        output_writer={
            'mime_type': 'text/plain',