def load(cls, db, **kwargs):
    """Load this class's CSV file into its database table.

    arguments:
        db: instance of gtfsdb.Database

    keyword arguments:
        gtfs_directory: path to unzipped GTFS files
        batch_size: batch size for memory management
        transport_mode: value stamped into each row's 'transport_mode' column
        modified_date: value stamped into each row's 'date_modified' column
    """
    log = logging.getLogger(cls.__module__)
    start_time = time.time()
    batch_size = kwargs.get('batch_size', config.DEFAULT_BATCH_SIZE)
    transport_mode = kwargs.get('transport_mode')
    modified_date = kwargs.get('modified_date')

    # guard: without a file name there is nothing to load
    if cls.filename is None:
        log.info("{0} lacks a 'filename' attribute ... not loading (exit load).".format(cls.__name__))
        return

    # resolve the directory holding the CSV, based on the datasource kind
    directory = None
    if cls.datasource == config.DATASOURCE_GTFS:
        directory = kwargs.get('gtfs_directory')
    elif cls.datasource == config.DATASOURCE_LOOKUP:
        directory = resource_filename('gtfsdb', 'data')
    # guard: unknown datasource or missing gtfs_directory would make
    # os.path.join(None, ...) raise TypeError below
    if directory is None:
        log.info("{0} has no data directory to load from (exit load).".format(cls.__name__))
        return

    file_path = os.path.join(directory, cls.filename)
    if os.path.exists(file_path):
        # py3 csv path reads raw bytes (UTF8Recoder does the decoding);
        # py2 wants text mode
        mode = 'rb' if sys.version_info >= (3, 0) else 'r'
        # 'with' guarantees the handle is closed even if a row or insert raises
        with open(file_path, mode) as f:
            utf8_file = util.UTF8Recoder(f, 'utf-8-sig')
            reader = csv.DictReader(utf8_file)
            # normalize header names so column lookups are case/space insensitive
            reader.fieldnames = [field.strip().lower() for field in reader.fieldnames]
            table = cls.__table__
            records = []
            i = 0
            for row in reader:
                # stamp provenance columns onto every record
                row['transport_mode'] = transport_mode
                row['date_modified'] = modified_date
                records.append(cls.make_record(row))
                i += 1
                if i >= batch_size:
                    # flush a batch to keep memory bounded
                    db.engine.execute(table.insert(), records)
                    sys.stdout.write('*')
                    records = []
                    i = 0
            # insert any remaining partial batch
            if len(records) > 0:
                db.engine.execute(table.insert(), records)

    process_time = time.time() - start_time
    log.debug('{0}.load ({1:.0f} seconds)'.format(cls.__name__, process_time))
def load(cls, db, **kwargs):
    """Load method for ORM

    arguments:
        db: instance of gtfsdb.Database

    keyword arguments:
        gtfs_directory: path to unzipped GTFS files
        batch_size: batch size for memory management
    """
    # step 0: set up some vars, including setting the log output to show the
    #         child of base that we're processing
    start_time = time.time()
    batch_size = kwargs.get('batch_size', config.DEFAULT_BATCH_SIZE)
    log = logging.getLogger(cls.__module__)

    # step 1: check that we have elements of a file path (a file name and a
    #         directory) for the data we'll load
    if cls.filename is None:
        log.info(
            "{0} lacks a 'filename' attribute ... not loading a null file (exit load)."
            .format(cls.__name__))
        return  # note early exit
    # equality (not identity) check: 'is' would silently fail if the config
    # constants are ever non-interned values
    if cls.datasource not in (config.DATASOURCE_GTFS, config.DATASOURCE_LOOKUP):
        log.info(
            "{0}.datasource != DATASOURCE_GTFS or DATASOURCE_LOOKUP (exit load)."
            .format(cls.__name__))
        return  # note early exit

    # step 2: load either a GTFS file from the unzipped file or a resource file
    #         (from a dir specified in config)
    directory = None
    if cls.datasource == config.DATASOURCE_GTFS:
        directory = kwargs.get('gtfs_directory')
    elif cls.datasource == config.DATASOURCE_LOOKUP:
        directory = resource_filename('gtfsdb', 'data')

    # step 3: load the file
    log.info("load {0}".format(cls.__name__))
    file_path = os.path.join(directory, cls.filename)
    if os.path.exists(file_path):
        # py3 csv path reads raw bytes (UTF8Recoder does the decoding);
        # py2 wants text mode
        mode = 'rb' if sys.version_info >= (3, 0) else 'r'
        # 'with' guarantees the handle is closed even if a row or insert raises
        with open(file_path, mode) as f:
            utf8_file = util.UTF8Recoder(f, 'utf-8-sig')
            reader = csv.DictReader(utf8_file)
            # normalize header names so column lookups are case/space insensitive
            reader.fieldnames = [field.strip().lower() for field in reader.fieldnames]
            table = cls.__table__
            try:
                # best-effort wipe of existing rows before the reload
                db.engine.execute(table.delete())
            except Exception:  # narrowed from bare 'except:' so SystemExit/KeyboardInterrupt still propagate
                log.debug("NOTE: couldn't delete this table")
            records = []
            i = 0
            for row in reader:
                records.append(cls.make_record(row))
                i += 1
                if i >= batch_size:
                    # flush a batch to keep memory bounded
                    db.engine.execute(table.insert(), records)
                    sys.stdout.write('*')
                    records = []
                    i = 0
            # insert any remaining partial batch
            if len(records) > 0:
                db.engine.execute(table.insert(), records)

    # step 4: done...
    process_time = time.time() - start_time
    log.debug('{0}.load ({1:.0f} seconds)'.format(cls.__name__, process_time))