Esempio n. 1
0
def test_load_items():
    """Round-trip two rows through ``Database.load_items`` on 'members'.

    The test removes any leftover rows with the test ids, bulk-loads two
    rows whose columns are all ``None`` except ``id``, checks both can be
    read back, then deletes them and checks they are gone.
    """
    database = Database()
    test_ids = ('testid1', 'testid2')

    # Start from a clean slate in case an earlier run left rows behind.
    for item_id in test_ids:
        database.delete_item('members', item_id)

    # Build one row per id with every column present but empty.
    columns = database.get_columns('members')
    items = []
    for item_id in test_ids:
        item = {column: None for column in columns}
        item['id'] = item_id
        items.append(item)

    database.load_items(items, 'members')
    for item_id in test_ids:
        loaded = database.get_item('members', item_id)
        assert loaded['id'] == item_id

    # Clean up and confirm each row is really gone.
    for item_id in test_ids:
        database.delete_item('members', item_id)
        assert database.get_item('members', item_id) is None
class EventbriteLoader(object):
    """Loads data from Eventbrite into Postgres.

    Events, their venues and their attendees are pulled through the
    ``Eventbrite`` client and written through the ``Database`` wrapper.
    """

    # Seconds to wait before retrying a request that came back empty.
    # The original comments attribute an empty response to the Eventbrite
    # rate limit.
    RATE_LIMIT_SLEEP_SECONDS = 3600
    # Seconds to pause after each processed event to stay under the
    # Eventbrite rate limit.
    EVENT_SLEEP_SECONDS = 60

    def __init__(self, eventbrite_org, database=None):
        """Set up logging, the database wrapper and the Eventbrite client.

        Args:
            eventbrite_org: organisation id passed to the client's
                ``get_events`` call.
            database: forwarded unchanged to ``Database(database=...)``.
        """
        daiquiri.setup(level=logging.INFO)
        self.logger = daiquiri.getLogger(__name__)

        self.database = Database(database=database)
        self.eventbrite = Eventbrite()
        self.eventbrite_org = eventbrite_org

    def run(self, test=False):
        """Runs the data load process.

        Starts from the earlier of "one day ago" and the last recorded
        event load date, or from the first available event when no load
        date is recorded, and walks every page of events.

        Args:
            test: when true nothing is written to the database, only the
                first page of attendees is fetched, and the method returns
                after the first non-empty event.
        """
        last_load_date = self.database.last_event_load_date()
        if last_load_date:
            look_back = datetime.datetime.now() - datetime.timedelta(days=1)
            first_event = min(look_back, last_load_date)
            # Keep only the YYYY-MM-DD part of the timestamp.
            start = str(first_event)[:10]
            self.logger.info('Loading events starting at %s' % (start))
        else:
            self.logger.info('Loading events from the first available event')
            start = None
        events = self.get_events(start=start, page=1)

        num_events = events['pagination']['object_count']
        if num_events > 0:
            self.logger.info('There are %s events to process' % (num_events))
        else:
            self.logger.info('There are not next events. Exiting')
            return

        while True:
            for event in events['events']:
                if not event:
                    continue
                self._load_event_data(event, test)
                if test:
                    return
                # Sleep to avoid the Eventbrite rate limit
                time.sleep(self.EVENT_SLEEP_SECONDS)

            if not events['pagination']['has_more_items']:
                break
            page = events['pagination']['page_number'] + 1
            msg = 'Pulling events on page %s' % (page)
            self.logger.info(msg)
            events = self.get_events(start, page)

    def _load_event_data(self, event, test):
        """Load one event plus its venue and attendees (no writes if test)."""
        msg = "Loading information for %s" % (event['name']['text'])
        self.logger.info(msg)

        # Load the event into the database. Delete the current
        # entry in order to maintain the unique index
        event_id = event['id']
        if not test:
            self.database.delete_item('events', event_id)
            self.load_event(event)

        # Load the venue, if it does not already appear in the database.
        # The lookup is skipped entirely when the event has no venue id.
        venue_id = event['venue_id']
        if venue_id and not self.database.get_item('venues', venue_id):
            venue = self.get_venue(venue_id)
            if not test:
                self.load_venue(venue)

        attendees = self.get_attendees(event_id, page=1)
        # An empty response (even after the retry) ends the attendee loop.
        while attendees:
            for attendee in attendees['attendees']:
                if not attendee:
                    continue
                if not test:
                    self.database.delete_item('attendees',
                                              attendee['id'],
                                              {'event_id': event_id})
                    self.load_attendee(attendee)

            if test or not attendees['pagination']['has_more_items']:
                break
            page = attendees['pagination']['page_number'] + 1
            attendees = self.get_attendees(event_id, page)

    def _fetch_with_retry(self, fetch):
        """Call ``fetch``; on an empty result sleep once and call it again.

        Returns whatever the last call returned, which may still be empty.
        """
        result = fetch()
        if not result:
            # Sleep until eventbrite resets
            minutes = self.RATE_LIMIT_SLEEP_SECONDS // 60
            self.logger.info(
                'Rate limit exceed. Sleeping %s mins' % (minutes))
            time.sleep(self.RATE_LIMIT_SLEEP_SECONDS)
            result = fetch()
        return result

    def get_events(self, start, page=1):
        """
        Pulls events from eventbrite and sleeps if the rate limit
        has been exceeded. The retry uses the same organisation id,
        start date and page as the first attempt.
        """
        org_id = self.eventbrite_org
        return self._fetch_with_retry(
            lambda: self.eventbrite.get_events(org_id=org_id,
                                               start=start,
                                               page=page))

    def get_attendees(self, event_id, page=1):
        """
        Pulls attendees from eventbrite and sleeps if the rate limit
        has been exceeded
        """
        return self._fetch_with_retry(
            lambda: self.eventbrite.get_attendees(event_id, page))

    def get_venue(self, venue_id, page=1):
        """
        Pull a venue and sleeps if the rate limit
        has been exceeded. The retry requests the same venue id.
        """
        return self._fetch_with_retry(
            lambda: self.eventbrite.get_venue(venue_id, page))

    def load_event(self, event):
        """Loads an event into the database.

        Works on a deep copy so the caller's dict is not modified. The
        nested ``start``/``end`` UTC strings are parsed into datetimes and
        the nested ``description``/``name`` objects are replaced by their
        ``text`` values before the row is written to 'events'.
        """
        event_ = deepcopy(event)

        event_['start_datetime'] = arrow.get(event_['start']['utc']).datetime
        event_['end_datetime'] = arrow.get(event_['end']['utc']).datetime
        event_['description'] = event_['description']['text']
        event_['name'] = event_['name']['text']

        event_['load_datetime'] = datetime.datetime.utcnow()
        self.database.load_item(event_, 'events')

    def load_attendee(self, attendee):
        """Loads an attendee into the database.

        Works on a deep copy. Profile fields that are present are lifted
        to the top level and the gross cost is stored as a float under
        ``cost`` before the row is written to 'attendees'.
        """
        attendee_ = deepcopy(attendee)

        profile = attendee_['profile']
        for field in ('name', 'first_name', 'last_name', 'email'):
            if field in profile:
                attendee_[field] = profile[field]

        cost = attendee_['costs']['gross']['major_value']
        attendee_['cost'] = float(cost)

        attendee_['load_datetime'] = datetime.datetime.utcnow()
        self.database.load_item(attendee_, 'attendees')

    def load_order(self, order):
        """Loads an order into the database.

        Works on a deep copy and stores the gross cost as a float under
        ``cost`` before the row is written to 'orders'.
        """
        order_ = deepcopy(order)

        cost = order_['costs']['gross']['major_value']
        order_['cost'] = float(cost)

        order_['load_datetime'] = datetime.datetime.utcnow()
        self.database.load_item(order_, 'orders')

    def load_venue(self, venue):
        """Loads a venue into the database.

        Works on a deep copy. Every key of the nested ``address`` dict is
        copied to the top level, then latitude and longitude are converted
        to floats before the row is written to 'venues'.
        """
        venue_ = deepcopy(venue)

        # Flatten the address fields onto the venue row.
        venue_.update(venue_['address'])

        venue_['latitude'] = float(venue_['latitude'])
        venue_['longitude'] = float(venue_['longitude'])
        self.database.load_item(venue_, 'venues')
class Geometries(object):
    """ Class for parsing and loading geojson files """

    # Seconds to wait for the KML download before giving up, so a stalled
    # connection cannot hang the whole load.
    REQUEST_TIMEOUT_SECONDS = 30
    # Seconds to pause between zip codes to avoid overwhelming the site
    # the geometries are downloaded from.
    DOWNLOAD_PAUSE_SECONDS = 5

    def __init__(self, database=None):
        """Set up logging, the database wrapper and the zip code search.

        Args:
            database: forwarded unchanged to ``Database(database=...)``.
        """
        daiquiri.setup(level=logging.INFO)
        self.logger = daiquiri.getLogger(__name__)

        self.database = Database(database=database)
        # Directory containing this module; temp files live in ./temp.
        self.path = os.path.dirname(os.path.realpath(__file__))
        self.url = 'https://www.zip-codes.com/cache/kml-zip/'
        self.search = SearchEngine(simple_zipcode=True)
        # Maps zip code -> search result dict (or None when not found).
        self.zip_code_cache = {}

    def load_all_zip_codes(self):
        """Loads all zipcodes geometries into the database. Pauses
        between loading each zipcode to avoid overwhelming the
        site we download the geometries from.

        A failure for one zip code is logged and the loop moves on;
        KeyboardInterrupt and SystemExit are not swallowed."""
        valid_zip_codes = self.get_all_zip_codes()
        for code in valid_zip_codes:
            try:
                self.logger.info("Loading geometry for {}".format(code))
                self.load_zip_code(code)
            except Exception:
                self.logger.warning('Geojson load failed for {}'.format(code))
            time.sleep(self.DOWNLOAD_PAUSE_SECONDS)

    def get_kml(self, zip_code):
        """Pulls the KML file for a zip code.

        Returns the response text on HTTP 200 and None for any other
        status code.
        """
        url = self.url + '%s.kml' % (zip_code)
        response = requests.get(url, timeout=self.REQUEST_TIMEOUT_SECONDS)
        if response.status_code == 200:
            return response.text
        return None

    def get_all_zip_codes(self):
        """Creates a list of valid zip codes by checking every five-digit
        code from 00000 through 99999 against the US zip code search
        object."""
        valid_zip_codes = []
        for number in range(100000):
            code = str(number).rjust(5, '0')
            results = self.get_zipcode_data(code)
            if results and results['zipcode'] is not None:
                valid_zip_codes.append(code)

        return valid_zip_codes

    def load_zip_code(self, zip_code):
        """Pulls a zip code and loads it into the database.

        Downloads the KML, converts it to GeoJSON through temporary files
        in the ``temp`` directory next to this module, and replaces the
        row for ``zip_code`` in 'geometries'. Does nothing when the KML
        cannot be downloaded. The temporary files are removed even when a
        step fails.
        """
        # Fetch the KML file from the resource
        kml = self.get_kml(zip_code)
        if not kml:
            return

        temp_dir = os.path.join(self.path, 'temp')
        if not os.path.isdir(temp_dir):
            os.makedirs(temp_dir)
        filename = os.path.join(temp_dir, '%s.kml' % (zip_code))
        geo_filename = os.path.join(temp_dir, '%s.geojson' % (zip_code))

        try:
            with open(filename, 'w') as f:
                f.write(kml)

            # Convert the KML file to GeoJSON
            kml2geojson.main.convert(filename, temp_dir)

            # Load the file into the database
            with open(geo_filename, 'r') as f:
                geo_json = json.load(f)

            zipcode_data = self.get_zipcode_data(zip_code)
            row = {
                'id': zip_code,
                'geometry': json.dumps(geo_json),
                'city': zipcode_data['major_city'],
                'county': zipcode_data['county'],
                'region': zipcode_data['state']
            }
            self.database.delete_item('geometries', zip_code)
            self.database.load_item(row, 'geometries')
        finally:
            # Delete the temporary files, whichever of them were created
            for temp_file in (filename, geo_filename):
                if os.path.exists(temp_file):
                    os.remove(temp_file)

    def get_zipcode_data(self, zipcode):
        """Pulls the search result for the specified zipcode.

        Returns the result as a dict, or None when the search result has
        no zipcode. Both outcomes are cached so each code is searched
        only once.
        """
        if zipcode in self.zip_code_cache:
            return self.zip_code_cache[zipcode]

        results = self.search.by_zipcode(zipcode)
        zipcode_data = results.to_dict() if results.zipcode else None
        # Cache city and state information for later so we don't
        # have to search against the search engine again
        self.zip_code_cache[zipcode] = zipcode_data
        return zipcode_data