def main():
    """CLI entry point: scrape one bill to disk, then optionally import it.

    Positional args select the scraper module, chamber, session and bill_id;
    ``--import`` runs the bill importer on the scraped output afterwards.
    """
    parser = argparse.ArgumentParser(
        description='Scrape data for single bill, saving data to disk.',
        parents=[base_arg_parser],
    )
    parser.add_argument('module', type=str, help='scraper module (eg. nc)')
    parser.add_argument('chamber', type=str, help='chamber for bill to scrape')
    parser.add_argument('session', type=str, help='session for bill to scrape')
    parser.add_argument('bill_id', type=str, help='bill_id to scrape')
    parser.add_argument('--strict', action='store_true', dest='strict',
                        default=False,
                        # fixed: adjacent string literals were missing a
                        # separating space ("whenencountering")
                        help="fail immediately when "
                             "encountering validation warning")
    parser.add_argument('-n', '--no_cache', action='store_true',
                        dest='no_cache', help="don't use web page cache")
    parser.add_argument('--fastmode', help="scrape in fast mode",
                        action="store_true", default=False)
    # fixed: a stray trailing comma after this call built a throwaway tuple
    parser.add_argument('-r', '--rpm', action='store', type=int, dest='rpm',
                        default=60)
    parser.add_argument('--import', dest='do_import',
                        help="import bill after scrape",
                        action="store_true", default=False)

    args = parser.parse_args()

    settings.update(args)

    # set up search path so scraper modules resolve by short name
    sys.path.insert(0, os.path.join(os.path.dirname(__file__),
                                    '../../openstates'))

    # get metadata from the scraper module itself
    metadata = __import__(args.module, fromlist=['metadata']).metadata
    abbr = metadata['abbreviation']

    # configure logger
    configure_logging(args.verbose, abbr)

    args.output_dir = os.path.join(settings.BILLY_DATA_DIR, abbr)
    _run_scraper(args, metadata)

    if args.do_import:
        import_bills(abbr, settings.BILLY_DATA_DIR)
def main():
    """CLI entry point: import scraped data for one abbreviation into the DB.

    Requires at least one of --bills / --legislators / --committees /
    --events / --alldata; metadata is always imported first.
    """
    parser = argparse.ArgumentParser(
        description='Import scraped data into database.',
        parents=[base_arg_parser],
    )
    parser.add_argument('abbreviation', type=str,
                        help=('the short name of the data to import'))
    parser.add_argument('-r', '--rpm', type=int, default=60,
                        help=('maximum number of documents to download '
                              'per minute'))
    # fixed: these flags select what to *import*; the original help text said
    # "scrape ...", a copy-paste slip from the scrape script
    parser.add_argument('--bills', action='store_true',
                        help='import bill data')
    parser.add_argument('--legislators', action='store_true',
                        help='import legislator data')
    parser.add_argument('--committees', action='store_true',
                        help='import (separate) committee data')
    parser.add_argument('--events', action='store_true',
                        help='import event data')
    parser.add_argument('--alldata', action='store_true', dest='alldata',
                        default=False, help="import all available data")

    args = parser.parse_args()

    # refuse to run with nothing selected
    if not (args.bills or args.legislators or args.committees or
            args.events or args.alldata):
        raise Exception("Must specify at least one type: --bills, "
                        "--legislators, --committees, --events, "
                        "--alldata")

    settings.update(args)

    data_dir = settings.BILLY_DATA_DIR

    # configure logger
    configure_logging(args.verbose, args.abbreviation)

    # always import metadata
    import_metadata(args.abbreviation, data_dir)

    if args.legislators or args.alldata:
        import_legislators(args.abbreviation, data_dir)
    if args.bills or args.alldata:
        import_bills(args.abbreviation, data_dir)
    if args.committees or args.alldata:
        import_committees(args.abbreviation, data_dir)
    # events currently excluded from --alldata
    if args.events:
        import_events(args.abbreviation, data_dir)
def main():
    """CLI entry point: scrape one bill for a state, then optionally import.

    The ``state`` positional is the scraper module name; the metadata's
    abbreviation is used for logging while ``args.state`` names the output
    directory (matching the original behavior).
    """
    parser = argparse.ArgumentParser(
        description='Scrape data for state, saving data to disk.',
        parents=[base_arg_parser],
    )
    parser.add_argument('state', type=str,
                        help='state scraper module (eg. nc)')
    parser.add_argument('chamber', type=str, help='chamber for bill to scrape')
    parser.add_argument('session', type=str, help='session for bill to scrape')
    parser.add_argument('bill_id', type=str, help='bill_id to scrape')
    parser.add_argument('--strict', action='store_true', dest='strict',
                        default=False,
                        # fixed: adjacent string literals were missing a
                        # separating space ("whenencountering")
                        help="fail immediately when "
                             "encountering validation warning")
    parser.add_argument('-n', '--no_cache', action='store_true',
                        dest='no_cache', help="don't use web page cache")
    parser.add_argument('--fastmode', help="scrape in fast mode",
                        action="store_true", default=False)
    # fixed: a stray trailing comma after this call built a throwaway tuple
    parser.add_argument('-r', '--rpm', action='store', type=int, dest='rpm',
                        default=60)
    parser.add_argument('--import', dest='do_import',
                        help="import bill after scrape",
                        action="store_true", default=False)

    args = parser.parse_args()

    settings.update(args)

    # set up search path so state modules resolve by short name
    sys.path.insert(0, os.path.join(os.path.dirname(__file__),
                                    '../../openstates'))

    # get metadata from the state module
    metadata = __import__(args.state, fromlist=['metadata']).metadata
    state = metadata['abbreviation']

    # configure logger
    configure_logging(args.verbose, state)

    args.output_dir = os.path.join(settings.BILLY_DATA_DIR, args.state)
    _run_scraper(args.state, state, args, metadata)

    if args.do_import:
        import_bills(args.state, settings.BILLY_DATA_DIR)
def main():
    """CLI entry point: scrape one bill, optionally import; exit(1) on ScrapeError."""
    try:
        parser = argparse.ArgumentParser(
            description="Scrape data for single bill, saving data to disk.",
            parents=[base_arg_parser]
        )
        parser.add_argument("module", type=str, help="scraper module (eg. nc)")
        parser.add_argument("chamber", type=str,
                            help="chamber for bill to scrape")
        parser.add_argument("session", type=str,
                            help="session for bill to scrape")
        parser.add_argument("bill_id", type=str, help="bill_id to scrape")
        parser.add_argument(
            "--strict",
            action="store_true",
            dest="strict",
            default=False,
            # fixed: adjacent string literals were missing a separating space
            help="fail immediately when encountering validation warning",
        )
        parser.add_argument("-n", "--no_cache", action="store_true",
                            dest="no_cache",
                            help="don't use web page cache")
        parser.add_argument("--fastmode", help="scrape in fast mode",
                            action="store_true", default=False)
        # fixed: stray trailing comma after this call built a throwaway tuple
        parser.add_argument("-r", "--rpm", action="store", type=int,
                            dest="rpm", default=60)
        parser.add_argument(
            "--import", dest="do_import", help="import bill after scrape",
            action="store_true", default=False
        )

        args = parser.parse_args()

        settings.update(args)

        # set up search path
        sys.path.insert(0, os.path.join(os.path.dirname(__file__),
                                        "../../openstates"))

        # get metadata
        metadata = __import__(args.module, fromlist=["metadata"]).metadata
        abbr = metadata["abbreviation"]

        # configure logger
        configure_logging(args.verbose, abbr)

        args.output_dir = os.path.join(settings.BILLY_DATA_DIR, abbr)
        _run_scraper(args, metadata)

        if args.do_import:
            import_bills(abbr, settings.BILLY_DATA_DIR)
    except ScrapeError as e:
        # %-formatting keeps the same output under Python 2 and 3
        # (original used a Python-2-only print statement)
        print("Error: %s" % e)
        sys.exit(1)
def _do_imports(abbrev, args):
    """Run the importers requested via ``args.types`` for *abbrev*.

    Metadata is always imported; each other importer runs only when its
    type name appears in ``args.types``. Returns a dict mapping type name
    to that importer's return value.
    """
    # Importers are imported lazily so that scraping never requires mongo.
    from billy.importers.metadata import import_metadata
    from billy.importers.bills import import_bills
    from billy.importers.legislators import import_legislators
    from billy.importers.committees import import_committees
    from billy.importers.events import import_events
    from billy.importers.speeches import import_speeches

    # metadata and districts are unconditional
    import_metadata(abbrev)

    requested = args.types
    data_dir = settings.BILLY_DATA_DIR
    report = {}

    if 'legislators' in requested:
        report['legislators'] = import_legislators(abbrev, data_dir)

    if 'bills' in requested:
        report['bills'] = import_bills(abbrev, data_dir)

    if 'committees' in requested:
        report['committees'] = import_committees(abbrev, data_dir)

    # events are also imported whenever speeches were requested
    if 'events' in requested or 'speeches' in requested:
        report['events'] = import_events(abbrev, data_dir)

    if 'speeches' in requested:
        report['speeches'] = import_speeches(abbrev, data_dir)

    return report
def main():
    """CLI entry point: scrape one bill, optionally import; exit(1) on ScrapeError."""
    try:
        parser = argparse.ArgumentParser(
            description='Scrape data for single bill, saving data to disk.',
            parents=[base_arg_parser],
        )
        parser.add_argument('module', type=str, help='scraper module (eg. nc)')
        parser.add_argument('chamber', type=str,
                            help='chamber for bill to scrape')
        parser.add_argument('session', type=str,
                            help='session for bill to scrape')
        parser.add_argument('bill_id', type=str, help='bill_id to scrape')
        parser.add_argument('--strict', action='store_true', dest='strict',
                            default=False,
                            # fixed: adjacent string literals were missing a
                            # separating space ("whenencountering")
                            help="fail immediately when "
                                 "encountering validation warning")
        parser.add_argument('-n', '--no_cache', action='store_true',
                            dest='no_cache', help="don't use web page cache")
        parser.add_argument('--fastmode', help="scrape in fast mode",
                            action="store_true", default=False)
        # fixed: stray trailing comma after this call built a throwaway tuple
        parser.add_argument('-r', '--rpm', action='store', type=int,
                            dest='rpm', default=60)
        parser.add_argument('--import', dest='do_import',
                            help="import bill after scrape",
                            action="store_true", default=False)

        args = parser.parse_args()

        settings.update(args)

        # get metadata
        metadata = __import__(args.module, fromlist=['metadata']).metadata
        abbr = metadata['abbreviation']

        # configure logger
        configure_logging(args.verbose, abbr)

        args.output_dir = os.path.join(settings.BILLY_DATA_DIR, abbr)
        _run_scraper(args, metadata)

        if args.do_import:
            import_bills(abbr, settings.BILLY_DATA_DIR)
    except ScrapeError as e:
        # %-formatting keeps the same output under Python 2 and 3
        # (original used a Python-2-only print statement)
        print('Error: %s' % e)
        sys.exit(1)
def _do_imports(abbrev, args):
    """Import metadata, districts, and the types requested in ``args.types``.

    Returns a dict mapping each imported type name to its importer's
    return value.
    """
    # do imports here so that scrape doesn't depend on mongo
    from billy.importers.metadata import import_metadata
    from billy.importers.bills import import_bills
    from billy.importers.legislators import import_legislators
    from billy.importers.committees import import_committees
    from billy.importers.events import import_events
    from billy.importers.speeches import import_speeches

    # always import metadata and districts
    import_metadata(abbrev)

    dist_filename = os.path.join(settings.BILLY_MANUAL_DATA_DIR, 'districts',
                                 '%s.csv' % abbrev)
    if os.path.exists(dist_filename):
        # replace all existing districts for this abbreviation
        db.districts.remove({'abbr': abbrev})
        # fixed: close the CSV file deterministically (it was previously
        # opened inline and never closed)
        with open(dist_filename) as dist_file:
            dist_csv = unicodecsv.DictReader(dist_file)
            for dist in dist_csv:
                dist['_id'] = '%(abbr)s-%(chamber)s-%(name)s' % dist
                dist['boundary_id'] = dist['boundary_id'] % dist
                dist['num_seats'] = int(dist['num_seats'])
                _log.debug(dist)
                db.districts.save(dist, safe=True)
    else:
        logging.getLogger('billy').warning("%s not found, continuing without "
                                           "districts" % dist_filename)

    report = {}

    if 'legislators' in args.types:
        report['legislators'] = import_legislators(
            abbrev, settings.BILLY_DATA_DIR
        )

    if 'bills' in args.types:
        report['bills'] = import_bills(abbrev, settings.BILLY_DATA_DIR)

    if 'committees' in args.types:
        report['committees'] = import_committees(
            abbrev, settings.BILLY_DATA_DIR
        )

    # events are also imported whenever speeches were requested
    if 'events' in args.types or 'speeches' in args.types:
        report['events'] = import_events(abbrev, settings.BILLY_DATA_DIR)

    if 'speeches' in args.types:
        report['speeches'] = import_speeches(abbrev, settings.BILLY_DATA_DIR)

    return report
def _do_imports(abbrev, args):
    """Import metadata, districts, and the types requested in ``args.types``.

    Returns a dict mapping each imported type name to its importer's
    return value.
    """
    # do imports here so that scrape doesn't depend on mongo
    from billy.importers.metadata import import_metadata
    from billy.importers.bills import import_bills
    from billy.importers.legislators import import_legislators
    from billy.importers.committees import import_committees
    from billy.importers.events import import_events
    from billy.importers.speeches import import_speeches

    # always import metadata and districts
    import_metadata(abbrev)

    dist_filename = os.path.join(settings.BILLY_MANUAL_DATA_DIR, 'districts',
                                 '%s.csv' % abbrev)
    if os.path.exists(dist_filename):
        # replace all existing districts for this abbreviation
        db.districts.remove({'abbr': abbrev})
        # fixed: close the CSV file deterministically (it was previously
        # opened inline and never closed)
        with open(dist_filename) as dist_file:
            dist_csv = unicodecsv.DictReader(dist_file)
            for dist in dist_csv:
                dist['_id'] = '%(abbr)s-%(chamber)s-%(name)s' % dist
                dist['boundary_id'] = dist['boundary_id'] % dist
                dist['num_seats'] = int(dist['num_seats'])
                db.districts.save(dist, safe=True)
    else:
        logging.getLogger('billy').warning("%s not found, continuing without "
                                           "districts" % dist_filename)

    report = {}

    if 'legislators' in args.types:
        report['legislators'] = \
            import_legislators(abbrev, settings.BILLY_DATA_DIR)

    if 'bills' in args.types:
        report['bills'] = import_bills(abbrev, settings.BILLY_DATA_DIR)

    if 'committees' in args.types:
        report['committees'] = \
            import_committees(abbrev, settings.BILLY_DATA_DIR)

    # events are also imported whenever speeches were requested
    if 'events' in args.types or 'speeches' in args.types:
        report['events'] = import_events(abbrev, settings.BILLY_DATA_DIR)

    if 'speeches' in args.types:
        report['speeches'] = import_speeches(abbrev, settings.BILLY_DATA_DIR)

    return report
# configure logger if args.verbose == 0: verbosity = logging.WARNING elif args.verbose == 1: verbosity = logging.INFO else: verbosity = logging.DEBUG logging.basicConfig(level=verbosity, format="%(asctime)s %(name)s %(levelname)s %(message)s", datefmt="%H:%M:%S") # always import metadata import_metadata(args.state, data_dir) if args.legislators or args.alldata: import_legislators(args.state, data_dir) if args.bills or args.alldata: import_bills(args.state, data_dir) if args.committees or args.alldata: import_committees(args.state, data_dir) if args.votes or args.alldata: import_votes(args.state, data_dir) # events and versions currently excluded from --alldata if args.events: import_events(args.state, data_dir) if args.versions: import_versions(args.state, args.rpm)
default=False, help="import all available data") args = parser.parse_args() if not (args.bills or args.legislators or args.committees or args.events or args.alldata): raise Exception("Must specify at least one type: --bills, " "--legislators, --committees, --events, " "--alldata") settings.update(args) data_dir = settings.BILLY_DATA_DIR # configure logger configure_logging(args.verbose, args.abbreviation) # always import metadata import_metadata(args.abbreviation, data_dir) if args.legislators or args.alldata: import_legislators(args.abbreviation, data_dir) if args.bills or args.alldata: import_bills(args.abbreviation, data_dir) if args.committees or args.alldata: import_committees(args.abbreviation, data_dir) # events currently excluded from --alldata if args.events: import_events(args.abbreviation, data_dir)
# NOTE(review): fragment of a larger import entry point — `parser`, `settings`
# and the import_* helpers are defined outside this view.

args = parser.parse_args()

# require at least one data type to be selected
if not (args.bills or args.legislators or args.committees or
        args.events or args.versions or args.alldata):
    raise Exception("Must specify at least one type: --bills, "
                    "--legislators, --committees, --events, "
                    "--versions, --alldata")

settings.update(args)

data_dir = settings.BILLY_DATA_DIR

# configure logger
configure_logging(args.verbose, args.state)

# always import metadata
import_metadata(args.state, data_dir)

if args.legislators or args.alldata:
    import_legislators(args.state, data_dir)
if args.bills or args.alldata:
    import_bills(args.state, data_dir)
if args.committees or args.alldata:
    import_committees(args.state, data_dir)
# events and versions currently excluded from --alldata
if args.events:
    import_events(args.state, data_dir)
if args.versions:
    # versions importer takes the rate limit, not the data dir
    import_versions(args.state, args.rpm)
help="import all available data") args = parser.parse_args() if not (args.bills or args.legislators or args.committees or args.events or args.alldata): raise Exception("Must specify at least one type: --bills, " "--legislators, --committees, --events, " "--alldata") settings.update(args) data_dir = settings.BILLY_DATA_DIR # configure logger configure_logging(args.verbose, args.abbreviation) # always import metadata import_metadata(args.abbreviation, data_dir) if args.legislators or args.alldata: import_legislators(args.abbreviation, data_dir) if args.bills or args.alldata: import_bills(args.abbreviation, data_dir) if args.committees or args.alldata: import_committees(args.abbreviation, data_dir) # events currently excluded from --alldata if args.events: import_events(args.abbreviation, data_dir)