Example #1
def main():

    parser = argparse.ArgumentParser(
        description='Scrape data for single bill, saving data to disk.',
        parents=[base_arg_parser],
    )

    parser.add_argument('module', type=str, help='scraper module (eg. nc)')
    parser.add_argument('chamber', type=str, help='chamber for bill to scrape')
    parser.add_argument('session', type=str, help='session for bill to scrape')
    parser.add_argument('bill_id', type=str, help='bill_id to scrape')

    parser.add_argument('--strict',
                        action='store_true',
                        dest='strict',
                        default=False,
                        help="fail immediately when"
                        "encountering validation warning")
    parser.add_argument('-n',
                        '--no_cache',
                        action='store_true',
                        dest='no_cache',
                        help="don't use web page cache")
    parser.add_argument('--fastmode',
                        help="scrape in fast mode",
                        action="store_true",
                        default=False)
    parser.add_argument('-r',
                        '--rpm',
                        action='store',
                        type=int,
                        dest='rpm',
                        default=60)
    parser.add_argument('--import',
                        dest='do_import',
                        help="import bill after scrape",
                        action="store_true",
                        default=False)

    args = parser.parse_args()

    settings.update(args)

    # set up search path
    sys.path.insert(
        0, os.path.join(os.path.dirname(__file__), '../../openstates'))

    # get metadata
    metadata = __import__(args.module, fromlist=['metadata']).metadata
    abbr = metadata['abbreviation']

    # configure logger
    configure_logging(args.verbose, abbr)

    args.output_dir = os.path.join(settings.BILLY_DATA_DIR, abbr)

    _run_scraper(args, metadata)

    if args.do_import:
        import_bills(abbr, settings.BILLY_DATA_DIR)
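
Every example builds its parser with parents=[base_arg_parser], and several go on to read args.verbose without defining it themselves, so the shared parent parser evidently supplies at least that flag. A minimal sketch of such a parent parser follows; the option names are assumptions, not taken from billy's source:

import argparse

# Hypothetical sketch of a shared parent parser; billy's real base_arg_parser
# may define more options. A parser passed via parents=[...] must be created
# with add_help=False so the child parser can add its own -h/--help.
base_arg_parser = argparse.ArgumentParser(add_help=False)
base_arg_parser.add_argument('-v', '--verbose', action='count', default=0,
                             help='be verbose (repeat for more detail)')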
Example #2
File: dump_csv.py Project: PamelaM/billy
def main():
    import argparse

    parser = argparse.ArgumentParser(
        description=('Dump data to a set of CSV files, optionally uploading to'
                     ' S3 when done.'),
        parents=[base_arg_parser],
    )
    parser.add_argument('abbrs', metavar='ABBR', type=str, nargs='+',
                help=('the two-letter abbreviation for the data to export'))
    parser.add_argument('--file', '-f',
                        help='filename to output to (defaults to <abbr>.zip)')
    parser.add_argument('--nozip', action='store_true', default=False,
                        help="don't zip the files")
    parser.add_argument('--upload', '-u', action='store_true', default=False,
                        help='upload the created archive to S3')

    args = parser.parse_args()

    settings.update(args)

    for abbr in args.abbrs:
        print 'dumping CSV for', abbr

        if not args.file:
            args.file = '{0}_csv.zip'.format(abbr)

        dump_csv(abbr, args.file, args.nozip)

        if args.upload:
            if args.nozip:
                raise Warning('Unable to --upload if --nozip is specified')
            else:
                upload(abbr, args.file)
Example #3
def main():
    parser = argparse.ArgumentParser(
        description='apply subject categorization for bills',
        parents=[base_arg_parser],
        conflict_handler='resolve',
    )

    default_dir = os.path.join(os.path.dirname(__file__),
                               '../../manual_data/subjects')

    parser.add_argument('abbr',
                        type=str,
                        help='abbreviation for data to process')
    parser.add_argument('--all',
                        help='update all sessions',
                        action='store_true',
                        default=False)
    parser.add_argument('-d',
                        '--data_dir',
                        help='directory of subject csvs',
                        dest='data_dir',
                        default=default_dir)
    args = parser.parse_args()

    settings.update(args)

    categorize_subjects(args.abbr, args.data_dir, args.all)
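
Example #3 (like several later ones) passes conflict_handler='resolve' alongside parents=[base_arg_parser]. That matters when the child wants to redefine an option string the parent already owns: without 'resolve', add_argument() raises argparse.ArgumentError. A self-contained demonstration, unrelated to billy's actual option set:

import argparse

parent = argparse.ArgumentParser(add_help=False)
parent.add_argument('--all', action='store_true', help='parent meaning of --all')

# With conflict_handler='resolve' the child's definition replaces the parent's;
# without it, the duplicate option string raises argparse.ArgumentError.
child = argparse.ArgumentParser(parents=[parent], conflict_handler='resolve')
child.add_argument('--all', action='store_true', default=False,
                   help='update all sessions')

print(child.parse_args(['--all']).all)   # True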
Example #4
def main():
    parser = argparse.ArgumentParser(description="set a legislators term end_date", parents=[base_arg_parser])

    parser.add_argument("leg_id", type=str, help="id of legislator to retire")
    parser.add_argument("date", type=str, help="YYYY-MM-DD date to set for legislator end_date")

    args = parser.parse_args()

    settings.update(args)

    retire_legislator(args.leg_id, args.date)
Example #5
def main():
    import sys
    import argparse

    parser = argparse.ArgumentParser(description='send bill versions to oyster',
                                     parents=[base_arg_parser])
    parser.add_argument('states', nargs='+', help='states to oysterize')
    args = parser.parse_args()
    settings.update(args)

    for state in args.states:
        print "Oysterizing %s bill versions" % state
        oysterize_versions(state)
Example #7
def main():
    parser = argparse.ArgumentParser(
        description='Import scraped data into database.',
        parents=[base_arg_parser],
    )

    parser.add_argument('abbreviation', type=str,
                        help=('the short name of the data to import'))
    parser.add_argument('-r', '--rpm', type=int, default=60,
                        help=('maximum number of documents to download '
                              'per minute'))
    parser.add_argument('--bills', action='store_true',
                        help='scrape bill data')
    parser.add_argument('--legislators', action='store_true',
                        help='scrape legislator data')
    parser.add_argument('--committees', action='store_true',
                        help='scrape (separate) committee data')
    parser.add_argument('--events', action='store_true',
                        help='scrape event data')
    parser.add_argument('--alldata', action='store_true', dest='alldata',
                        default=False, help="import all available data")

    args = parser.parse_args()

    if not (args.bills or args.legislators or args.committees or
            args.events or args.alldata):
        raise Exception("Must specify at least one type: --bills, "
                           "--legislators, --committees, --events, "
                           "--alldata")

    settings.update(args)

    data_dir = settings.BILLY_DATA_DIR

    # configure logger
    configure_logging(args.verbose, args.abbreviation)

    # always import metadata
    import_metadata(args.abbreviation, data_dir)

    if args.legislators or args.alldata:
        import_legislators(args.abbreviation, data_dir)
    if args.bills or args.alldata:
        import_bills(args.abbreviation, data_dir)
    if args.committees or args.alldata:
        import_committees(args.abbreviation, data_dir)

    # events currently excluded from --alldata
    if args.events:
        import_events(args.abbreviation, data_dir)
Example #8
def main():
    parser = argparse.ArgumentParser(
        description='run name matching against a session',
        parents=[base_arg_parser],
    )

    parser.add_argument('abbr', help='abbr to run matching for')
    parser.add_argument('term', help='term to run matching for')

    args = parser.parse_args()

    settings.update(args)

    match_names(args.abbr, args.term)
Example #10
def main():
    import argparse

    configure_logging(1)

    parser = argparse.ArgumentParser(
        description=('Dump API information to a zipped directory of JSON files'
                     ', optionally uploading to S3 when done.'),
        parents=[base_arg_parser],
    )
    parser.add_argument(
        'abbrs',
        metavar='ABBR',
        type=str,
        nargs='+',
        help=('the two-letter abbreviation for the data to export'))
    parser.add_argument('--file',
                        '-f',
                        help='filename to output to (defaults to <abbr>.zip)')
    parser.add_argument('--schema_dir',
                        help='directory to use for API schemas (optional)',
                        default=None)
    parser.add_argument('--nodump',
                        action='store_true',
                        default=False,
                        help="don't run the dump, only upload")
    parser.add_argument('--novalidate',
                        action='store_true',
                        default=False,
                        help="don't run validation")
    parser.add_argument('--upload',
                        '-u',
                        action='store_true',
                        default=False,
                        help='upload the created archive to S3')

    args = parser.parse_args()

    settings.update(args)

    for abbr in args.abbrs:
        if not args.file:
            args.file = abbr + '.zip'

        if not args.nodump:
            dump_json(abbr, args.file, not args.novalidate, args.schema_dir)

        if args.upload:
            upload(abbr, args.file)
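
Note that in Examples #2, #10, #25, and #31, args.file is only defaulted while it is still empty, so when several abbreviations are passed without --file the name computed for the first one is reused for all of them. If a separate archive per abbreviation is the intent, a local name sidesteps the mutation; a sketch against the flags defined in Example #10:

    for abbr in args.abbrs:
        # derive a per-abbr filename instead of mutating args.file once
        filename = args.file or (abbr + '.zip')

        if not args.nodump:
            dump_json(abbr, filename, not args.novalidate, args.schema_dir)

        if args.upload:
            upload(abbr, filename)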
Example #11
def main():

    parser = argparse.ArgumentParser(
        description='Scrape data for state, saving data to disk.',
        parents=[base_arg_parser],
    )

    parser.add_argument('state', type=str,
                        help='state scraper module (eg. nc)')
    parser.add_argument('chamber', type=str,
                        help='chamber for bill to scrape')
    parser.add_argument('session', type=str,
                        help='session for bill to scrape')
    parser.add_argument('bill_id', type=str,
                        help='bill_id to scrape')
    parser.add_argument('--strict', action='store_true', dest='strict',
                        default=False, help="fail immediately when"
                        "encountering validation warning")
    parser.add_argument('-n', '--no_cache', action='store_true',
                        dest='no_cache', help="don't use web page cache")
    parser.add_argument('--fastmode', help="scrape in fast mode",
                        action="store_true", default=False)
    parser.add_argument('-r', '--rpm', action='store', type=int, dest='rpm',
                        default=60)
    parser.add_argument('--import', dest='do_import',
                        help="import bill after scrape",
                        action="store_true", default=False)

    args = parser.parse_args()

    settings.update(args)

    # set up search path
    sys.path.insert(0, os.path.join(os.path.dirname(__file__),
                                    '../../openstates'))

    # get metadata
    metadata = __import__(args.state, fromlist=['metadata']).metadata
    state = metadata['abbreviation']

    # configure logger
    configure_logging(args.verbose, state)

    args.output_dir = os.path.join(settings.BILLY_DATA_DIR, args.state)

    _run_scraper(args.state, state, args, metadata)

    if args.do_import:
        import_bills(args.state, settings.BILLY_DATA_DIR)
Example #12
def main():
    parser = argparse.ArgumentParser(
        description='Populate database with district information.',
        parents=[base_arg_parser],
    )

    parser.add_argument('files', metavar='DISTRICT-FILE', type=str, nargs='+',
                help=('the path to the district CSV file to import'))

    args = parser.parse_args()

    settings.update(args)

    for file in args.files:
        import_district_csv(file)
Example #13
def main():
    parser = argparse.ArgumentParser(
        description="apply subject categorization for bills", parents=[base_arg_parser], conflict_handler="resolve"
    )

    default_dir = os.path.join(os.path.dirname(__file__), "../../manual_data/subjects")

    parser.add_argument("abbr", type=str, help="abbreviation for data to process")
    parser.add_argument("--all", help="update all sessions", action="store_true", default=False)
    parser.add_argument("-d", "--data_dir", help="directory of subject csvs", dest="data_dir", default=default_dir)
    args = parser.parse_args()

    settings.update(args)

    categorize_subjects(args.abbr, args.data_dir, args.all)
Example #14
def main():
    parser = argparse.ArgumentParser(
        description='apply subject categorization for bills',
        parents=[base_arg_parser],
        conflict_handler='resolve',
    )

    parser.add_argument('abbr', type=str, help='abbreviation for data to process')
    parser.add_argument('--all', help='update all sessions',
                        action='store_true', default=False)
    args = parser.parse_args()

    settings.update(args)

    categorize_subjects(args.abbr, args.all)
Example #15
def main():
    parser = argparse.ArgumentParser(
        description="dump a CSV of missing leg_id's",
        parents=[base_arg_parser],
    )
    parser.add_argument('abbrs', metavar='ABBR', type=str, nargs='+',
                        help='data abbreviations to dump')
    parser.add_argument('--detailed', action='store_true', default=False,
                        help='print detailed csvs as well')
    args = parser.parse_args()

    settings.update(args)

    for abbr in args.abbrs:
        dump_missing_leg_ids(abbr, args.detailed)
Example #16
def main():
    try:
        parser = argparse.ArgumentParser(
            description="Scrape data for single bill, saving data to disk.", parents=[base_arg_parser]
        )

        parser.add_argument("module", type=str, help="scraper module (eg. nc)")
        parser.add_argument("chamber", type=str, help="chamber for bill to scrape")
        parser.add_argument("session", type=str, help="session for bill to scrape")
        parser.add_argument("bill_id", type=str, help="bill_id to scrape")

        parser.add_argument(
            "--strict",
            action="store_true",
            dest="strict",
            default=False,
            help="fail immediately when" "encountering validation warning",
        )
        parser.add_argument("-n", "--no_cache", action="store_true", dest="no_cache", help="don't use web page cache")
        parser.add_argument("--fastmode", help="scrape in fast mode", action="store_true", default=False)
        parser.add_argument("-r", "--rpm", action="store", type=int, dest="rpm", default=60),
        parser.add_argument(
            "--import", dest="do_import", help="import bill after scrape", action="store_true", default=False
        )

        args = parser.parse_args()

        settings.update(args)

        # set up search path
        sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../../openstates"))

        # get metadata
        metadata = __import__(args.module, fromlist=["metadata"]).metadata
        abbr = metadata["abbreviation"]

        # configure logger
        configure_logging(args.verbose, abbr)

        args.output_dir = os.path.join(settings.BILLY_DATA_DIR, abbr)

        _run_scraper(args, metadata)

        if args.do_import:
            import_bills(abbr, settings.BILLY_DATA_DIR)
    except ScrapeError as e:
        print "Error:", e
        sys.exit(1)
Example #17
def main():
    import sys
    import argparse

    parser = argparse.ArgumentParser(description='validate API results',
                                     parents=[base_arg_parser])
    parser.add_argument('states', nargs='+', help='states to validate')
    parser.add_argument('--schema_dir',
                        help='directory to use for API schemas (optional)',
                        default=None)
    args = parser.parse_args()
    settings.update(args)

    for state in args.states:
        print "Validating %s" % state
        validate_api(state, args.schema_dir)
Example #18
def main():
    parser = argparse.ArgumentParser(
        description="set a legislator's term end_date",
        parents=[base_arg_parser],
    )

    parser.add_argument('leg_id', type=str,
                        help='id of legislator to retire')
    parser.add_argument('date', type=str,
                        help='YYYY-MM-DD date to set for legislator end_date')

    args = parser.parse_args()

    settings.update(args)

    retire_legislator(args.leg_id, args.date)
Example #19
def main():
    parser = argparse.ArgumentParser(
        description='Convert state bills to SFM-ready text',
        parents=[base_arg_parser],
    )
    parser.add_argument('state', type=str, help='state')
    parser.add_argument('--sfm_server', type=str, help='URL of SFM instance',
                        default='http://localhost:8080/')

    args = parser.parse_args()

    settings.update(args)

    configure_logging(args.verbose, args.state)

    process_state_files(args.state, args.sfm_server)
Example #21
def main():
    try:
        parser = argparse.ArgumentParser(
            description='Scrape data for single bill, saving data to disk.',
            parents=[base_arg_parser],
        )

        parser.add_argument('module', type=str, help='scraper module (eg. nc)')
        parser.add_argument('chamber', type=str,
                            help='chamber for bill to scrape')
        parser.add_argument('session', type=str,
                            help='session for bill to scrape')
        parser.add_argument('bill_id', type=str, help='bill_id to scrape')

        parser.add_argument('--strict', action='store_true', dest='strict',
                            default=False, help="fail immediately when"
                            "encountering validation warning")
        parser.add_argument('-n', '--no_cache', action='store_true',
                            dest='no_cache', help="don't use web page cache")
        parser.add_argument('--fastmode', help="scrape in fast mode",
                            action="store_true", default=False)
        parser.add_argument('-r', '--rpm', action='store', type=int,
                            dest='rpm', default=60)
        parser.add_argument('--import', dest='do_import',
                            help="import bill after scrape",
                            action="store_true", default=False)

        args = parser.parse_args()

        settings.update(args)

        # get metadata
        metadata = __import__(args.module, fromlist=['metadata']).metadata
        abbr = metadata['abbreviation']

        # configure logger
        configure_logging(args.verbose, abbr)

        args.output_dir = os.path.join(settings.BILLY_DATA_DIR, abbr)

        _run_scraper(args, metadata)

        if args.do_import:
            import_bills(abbr, settings.BILLY_DATA_DIR)
    except ScrapeError as e:
        print 'Error:', e
        sys.exit(1)
Example #22
def main():
    parser = argparse.ArgumentParser(
        description='load a CSV of legislator data',
        parents=[base_arg_parser],
    )

    parser.add_argument('files', metavar='FILE', type=str, nargs='+',
                help='filenames to import')
    parser.add_argument('--save', action='store_true', default=False,
                        help='save changes to database (default is dry run)')

    args = parser.parse_args()

    settings.update(args)

    for file in args.files:
        process_file(file, args.save)
Example #23
def main():
    parser = argparse.ArgumentParser(
        description='Populate database with district information.',
        parents=[base_arg_parser],
    )

    parser.add_argument('files',
                        metavar='DISTRICT-FILE',
                        type=str,
                        nargs='+',
                        help=('the path to the district CSV file to import'))

    args = parser.parse_args()

    settings.update(args)

    for file in args.files:
        import_district_csv(file)
Example #24
def main():
    parser = argparse.ArgumentParser(
        description='Convert state bills to SFM-ready text',
        parents=[base_arg_parser],
    )
    parser.add_argument('state', type=str, help='state')
    parser.add_argument('--sfm_server',
                        type=str,
                        help='URL of SFM instance',
                        default='http://localhost:8080/')

    args = parser.parse_args()

    settings.update(args)

    configure_logging(args.verbose, args.state)

    process_state_files(args.state, args.sfm_server)
Example #25
def main():
    import argparse

    parser = argparse.ArgumentParser(
        description=('Dump data to a set of CSV files, optionally uploading to'
                     ' S3 when done.'),
        parents=[base_arg_parser],
    )
    parser.add_argument(
        'abbrs',
        metavar='ABBR',
        type=str,
        nargs='+',
        help=('the two-letter abbreviation for the data to export'))
    parser.add_argument('--file',
                        '-f',
                        help='filename to output to (defaults to <abbr>.zip)')
    parser.add_argument('--nozip',
                        action='store_true',
                        default=False,
                        help="don't zip the files")
    parser.add_argument('--upload',
                        '-u',
                        action='store_true',
                        default=False,
                        help='upload the created archive to S3')

    args = parser.parse_args()

    settings.update(args)

    for abbr in args.abbrs:
        print 'dumping CSV for', abbr

        if not args.file:
            args.file = '{0}_csv.zip'.format(abbr)

        dump_csv(abbr, args.file, args.nozip)

        if args.upload:
            if args.nozip:
                raise Warning('Unable to --upload if --nozip is specified')
            else:
                upload(abbr, args.file)
Example #26
File: util.py Project: annerajb/billy
def main():
    parser = argparse.ArgumentParser(description='generic billy util',
                                     parents=[base_arg_parser])
    subparsers = parser.add_subparsers(dest='subcommand')

    # import command plugins
    for mod in COMMAND_MODULES:
        import_command_module(mod)

    # instantiate all subcommands
    subcommands = {}
    for SubcommandCls in BaseCommand.subcommands:
        subcommands[SubcommandCls.name] = SubcommandCls(subparsers)

    # parse arguments, update settings, then run the appropriate function
    args = parser.parse_args()
    settings.update(args)
    configure_logging(args.subcommand)
    subcommands[args.subcommand].handle(args)
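
Example #26 discovers its subcommands through BaseCommand.subcommands after importing the command modules, so each plugin presumably registers itself and knows how to attach its own sub-parser. A minimal sketch of what such a base class could look like; everything beyond the BaseCommand, subcommands, name, and handle names visible in the example is an assumption, not billy's actual implementation:

class BaseCommand(object):
    # subclasses defined in the imported command modules append themselves
    # here (e.g. via a metaclass or an explicit register call), so main()
    # can instantiate every plugin it finds
    subcommands = []

    name = None    # subcommand name on the command line
    help = ''      # one-line description for --help output

    def __init__(self, subparsers):
        # each plugin owns its sub-parser and adds its own arguments to it
        self.subparser = subparsers.add_parser(self.name, help=self.help)
        self.add_args()

    def add_args(self):
        pass

    def handle(self, args):
        raise NotImplementedError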
Example #28
def main():
    parser = argparse.ArgumentParser(
        description="dump a CSV of missing leg_id's",
        parents=[base_arg_parser],
    )
    parser.add_argument('abbrs',
                        metavar='ABBR',
                        type=str,
                        nargs='+',
                        help='data abbreviations to dump')
    parser.add_argument('--detailed',
                        action='store_true',
                        default=False,
                        help='print detailed csvs as well')
    args = parser.parse_args()

    settings.update(args)

    for abbr in args.abbrs:
        dump_missing_leg_ids(abbr, args.detailed)
Example #29
def main():
    parser = argparse.ArgumentParser(
        description='load a CSV of legislator data',
        parents=[base_arg_parser],
    )

    parser.add_argument('files',
                        metavar='FILE',
                        type=str,
                        nargs='+',
                        help='filenames to import')
    parser.add_argument('--save',
                        action='store_true',
                        default=False,
                        help='save changes to database (default is dry run)')

    args = parser.parse_args()

    settings.update(args)

    for file in args.files:
        process_file(file, args.save)
Example #30
def main():
    parser = argparse.ArgumentParser(
        description=('attempt to match legislators with ids in other '
                     'relevant APIs'),
        parents=[base_arg_parser],
    )

    parser.add_argument('abbrs',
                        metavar='ABBR',
                        type=str,
                        nargs='+',
                        help='abbreviations for data to update')

    args = parser.parse_args()

    settings.update(args)

    votesmart.apikey = settings.VOTESMART_API_KEY

    for abbr in args.abbrs:
        update_missing_ids(abbr, settings.SUNLIGHT_SERVICES_KEY)
        time.sleep(30)
Example #31
File: dump_json.py Project: PamelaM/billy
def main():
    import argparse

    configure_logging(1)

    parser = argparse.ArgumentParser(
        description=('Dump API information to a zipped directory of JSON files'
                     ', optionally uploading to S3 when done.'),
        parents=[base_arg_parser],
    )
    parser.add_argument('abbrs', metavar='ABBR', type=str, nargs='+',
                help=('the two-letter abbreviation for the data to export'))
    parser.add_argument('--file', '-f',
                        help='filename to output to (defaults to <abbr>.zip)')
    parser.add_argument('--schema_dir',
                        help='directory to use for API schemas (optional)',
                        default=None)
    parser.add_argument('--nodump', action='store_true', default=False,
                        help="don't run the dump, only upload")
    parser.add_argument('--novalidate', action='store_true', default=False,
                        help="don't run validation")
    parser.add_argument('--upload', '-u', action='store_true', default=False,
                        help='upload the created archive to S3')

    args = parser.parse_args()

    settings.update(args)

    for abbr in args.abbrs:
        if not args.file:
            args.file = abbr + '.zip'

        if not args.nodump:
            dump_json(abbr, args.file, not args.novalidate, args.schema_dir)

        if args.upload:
            upload(abbr, args.file)
Example #32
File: scrape.py Project: addamh/openstates
def main():

    parser = argparse.ArgumentParser(
        description='Scrape data for state, saving data to disk.',
        parents=[base_arg_parser],
    )

    parser.add_argument('state', type=str,
                        help='state scraper module (eg. nc)')
    parser.add_argument('-s', '--session', action='append', dest='sessions',
                        help='session(s) to scrape')
    parser.add_argument('-t', '--term', action='append', dest='terms',
                        help='term(s) to scrape')
    parser.add_argument('--upper', action='store_true', dest='upper',
                        default=False, help='scrape upper chamber')
    parser.add_argument('--lower', action='store_true', dest='lower',
                        default=False, help='scrape lower chamber')
    parser.add_argument('--bills', action='store_true', dest='bills',
                        default=False, help="scrape bill data")
    parser.add_argument('--legislators', action='store_true',
                        dest='legislators', default=False,
                        help="scrape legislator data")
    parser.add_argument('--committees', action='store_true', dest='committees',
                        default=False, help="scrape committee data")
    parser.add_argument('--votes', action='store_true', dest='votes',
                        default=False, help="scrape vote data")
    parser.add_argument('--events', action='store_true', dest='events',
                        default=False, help='scrape event data')
    parser.add_argument('--alldata', action='store_true', dest='alldata',
                        default=False,
                        help="scrape all available types of data")
    parser.add_argument('--strict', action='store_true', dest='strict',
                        default=False, help="fail immediately when"
                        "encountering validation warning")
    parser.add_argument('-n', '--no_cache', action='store_true',
                        dest='no_cache', help="don't use web page cache")
    parser.add_argument('--fastmode', help="scrape in fast mode",
                        action="store_true", default=False)
    parser.add_argument('-r', '--rpm', action='store', type=int, dest='rpm',
                        default=60)
    parser.add_argument('--timeout', action='store', type=int, dest='timeout',
                        default=10)

    args = parser.parse_args()

    settings.update(args)

    # set up search path
    sys.path.insert(0, os.path.join(os.path.dirname(__file__),
                                    '../../openstates'))

    # get metadata
    metadata = __import__(args.state, fromlist=['metadata']).metadata
    state = metadata['abbreviation']

    configure_logging(args.verbose, args.state)

    # make output dir
    args.output_dir = os.path.join(settings.BILLY_DATA_DIR, args.state)
    try:
        os.makedirs(args.output_dir)
    except OSError as e:
        if e.errno != 17:
            raise e

    # write metadata
    try:
        schema_path = os.path.join(os.path.split(__file__)[0],
                                   '../schemas/metadata.json')
        schema = json.load(open(schema_path))

        validator = DatetimeValidator()
        validator.validate(metadata, schema)
    except ValueError as e:
        logging.getLogger('billy').warning('metadata validation error: '
                                                 + str(e))

    with open(os.path.join(args.output_dir, 'state_metadata.json'), 'w') as f:
        json.dump(metadata, f, cls=JSONDateEncoder)

    # determine time period to run for
    if args.terms:
        args.sessions = args.sessions or []
        for term in metadata['terms']:
            if term['name'] in args.terms:
                args.sessions.extend(term['sessions'])
    args.sessions = set(args.sessions or [])

    # determine chambers
    args.chambers = []
    if args.upper:
        args.chambers.append('upper')
    if args.lower:
        args.chambers.append('lower')
    if not args.chambers:
        args.chambers = ['upper', 'lower']

    if not (args.bills or args.legislators or args.votes or
            args.committees or args.events or args.alldata):
        raise ScrapeError("Must specify at least one of --bills, "
                          "--legislators, --committees, --votes, --events, "
                          "--alldata")

    if args.alldata:
        args.bills = True
        args.legislators = True
        args.votes = True
        args.committees = True

    if args.bills:
        _run_scraper(args.state, state, 'bills', args, metadata)
    if args.legislators:
        _run_scraper(args.state, state, 'legislators', args, metadata)
    if args.committees:
        _run_scraper(args.state, state, 'committees', args, metadata)
    if args.votes:
        _run_scraper(args.state, state, 'votes', args, metadata)
    if args.events:
        _run_scraper(args.state, state, 'events', args, metadata)
Example #33
def main():

    parser = argparse.ArgumentParser(
        description='Scrape data for state, saving data to disk.',
        parents=[base_arg_parser],
    )

    parser.add_argument('state', type=str,
                        help='state scraper module (eg. nc)')
    parser.add_argument('-s', '--session', action='append', dest='sessions',
                        help='session(s) to scrape')
    parser.add_argument('-t', '--term', action='append', dest='terms',
                        help='term(s) to scrape')
    parser.add_argument('--upper', action='store_true', dest='upper',
                        default=False, help='scrape upper chamber')
    parser.add_argument('--lower', action='store_true', dest='lower',
                        default=False, help='scrape lower chamber')
    parser.add_argument('--bills', action='store_true', dest='bills',
                        default=False, help="scrape bill data")
    parser.add_argument('--legislators', action='store_true',
                        dest='legislators', default=False,
                        help="scrape legislator data")
    parser.add_argument('--committees', action='store_true', dest='committees',
                        default=False, help="scrape committee data")
    parser.add_argument('--votes', action='store_true', dest='votes',
                        default=False, help="scrape vote data")
    parser.add_argument('--events', action='store_true', dest='events',
                        default=False, help='scrape event data')
    parser.add_argument('--alldata', action='store_true', dest='alldata',
                        default=False,
                        help="scrape all available types of data")
    parser.add_argument('--strict', action='store_true', dest='strict',
                        default=False, help="fail immediately when"
                        "encountering validation warning")
    parser.add_argument('-n', '--no_cache', action='store_true',
                        dest='no_cache', help="don't use web page cache")
    parser.add_argument('-r', '--rpm', action='store', type=int, dest='rpm',
                        default=60)

    args = parser.parse_args()

    settings.update(args)

    # set up search path
    sys.path.insert(0, os.path.join(os.path.dirname(__file__),
                                    '../../openstates'))

    # get metadata
    metadata = __import__(args.state, fromlist=['metadata']).metadata
    state = metadata['abbreviation']

    # configure logger
    if args.verbose == 0:
        verbosity = logging.WARNING
    elif args.verbose == 1:
        verbosity = logging.INFO
    else:
        verbosity = logging.DEBUG

    logging.basicConfig(level=verbosity,
                        format="%(asctime)s %(name)s %(levelname)s " + state +
                               " %(message)s",
                        datefmt="%H:%M:%S",
                       )

    # make output dir
    args.output_dir = os.path.join(settings.BILLY_DATA_DIR, state)
    try:
        os.makedirs(args.output_dir)
    except OSError as e:
        if e.errno != 17:
            raise e
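
The bare errno comparison that closes Examples #32, #33, and #35 tests against the literal 17, which is errno.EEXIST; the named constant makes the intent explicit. An equivalent helper, not part of billy:

import errno
import os

def ensure_dir(path):
    # same behaviour as the try/except above, but with the named constant
    try:
        os.makedirs(path)
    except OSError as e:
        if e.errno != errno.EEXIST:
            raise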
Example #34
def main(old_scrape_compat=False):
    try:
        parser = argparse.ArgumentParser(
            description='update billy data',
            parents=[base_arg_parser],
        )

        what = parser.add_argument_group(
            'what to scrape', 'flags that help select what data to scrape')
        scrape = parser.add_argument_group('scraper config',
                                           'settings for the scraper')

        parser.add_argument('module', type=str, help='scraper module (eg. nc)')
        what.add_argument('-s',
                          '--session',
                          action='append',
                          dest='sessions',
                          default=[],
                          help='session(s) to scrape')
        what.add_argument('-t',
                          '--term',
                          action='append',
                          dest='terms',
                          help='term(s) to scrape',
                          default=[])

        for arg in ('upper', 'lower'):
            what.add_argument('--' + arg,
                              action='append_const',
                              dest='chambers',
                              const=arg)
        for arg in ('bills', 'legislators', 'committees', 'votes', 'events'):
            what.add_argument('--' + arg,
                              action='append_const',
                              dest='types',
                              const=arg)
        for arg in ('scrape', 'import', 'report'):
            parser.add_argument('--' + arg,
                                dest='actions',
                                action="append_const",
                                const=arg,
                                help='only run %s step' % arg)

        # special modes for debugging
        scrape.add_argument('--nonstrict',
                            action='store_false',
                            dest='strict',
                            default=True,
                            help="don't fail immediately when"
                            " encountering validation warning")
        scrape.add_argument('--fastmode',
                            help="scrape in fast mode",
                            action="store_true",
                            default=False)

        # scrapelib overrides
        scrape.add_argument('-r',
                            '--rpm',
                            action='store',
                            type=int,
                            dest='SCRAPELIB_RPM')
        scrape.add_argument('--timeout',
                            action='store',
                            type=int,
                            dest='SCRAPELIB_TIMEOUT')
        scrape.add_argument('--retries',
                            type=int,
                            dest='SCRAPELIB_RETRY_ATTEMPTS')
        scrape.add_argument('--retry_wait',
                            type=int,
                            dest='SCRAPELIB_RETRY_WAIT_SECONDS')

        args = parser.parse_args()

        # inject scraper paths so scraper module can be found
        for newpath in settings.SCRAPER_PATHS:
            sys.path.insert(0, newpath)

        # get metadata
        module = __import__(args.module)
        metadata = module.metadata
        module_settings = getattr(module, 'settings', {})
        abbrev = metadata['abbreviation']

        # load state settings, then command line settings
        settings.update(module_settings)
        settings.update(args)

        configure_logging(args.module)

        # configure oyster
        if settings.ENABLE_OYSTER:
            from oyster.conf import settings as oyster_settings
            oyster_settings.DOCUMENT_CLASSES[
                args.module + ':billtext'] = module.document_class

        # make output dir
        args.output_dir = os.path.join(settings.BILLY_DATA_DIR, abbrev)
        _clear_scraped_data(args.output_dir)

        # if terms aren't set, use latest
        if not args.terms:
            if args.sessions:
                for session in args.sessions:
                    args.terms.append(
                        term_for_session(metadata['abbreviation'], session,
                                         metadata))
                args.terms = list(set(args.terms or []))
            else:
                latest_term = metadata['terms'][-1]['name']
                args.terms = [latest_term]
        # only set sessions from terms if sessions weren't set
        elif not args.sessions:
            for term in metadata['terms']:
                if term['name'] in args.terms:
                    args.sessions.extend(term['sessions'])
            # dedup sessions
            args.sessions = list(set(args.sessions or []))

        if not args.sessions:
            args.sessions = [metadata['terms'][-1]['sessions'][-1]]

        # determine chambers
        if not args.chambers:
            args.chambers = ['upper', 'lower']

        if not args.actions:
            if old_scrape_compat:
                args.actions = ['scrape']
            else:
                args.actions = ['scrape', 'import', 'report']

        if not args.types:
            args.types = [
                'bills', 'legislators', 'votes', 'committees', 'alldata'
            ]
            if 'events' in metadata['feature_flags']:
                args.types.append('events')

        plan = """billy-update abbr=%s
    actions=%s
    types=%s
    sessions=%s
    terms=%s""" % (args.module, ','.join(args.actions), ','.join(
            args.types), ','.join(args.sessions), ','.join(args.terms))
        logging.getLogger('billy').info(plan)

        scrape_data = {}

        if 'scrape' in args.actions:
            # validate then write metadata
            if hasattr(module, 'session_list'):
                session_list = module.session_list()
            else:
                session_list = []
            check_sessions(metadata, session_list)

            try:
                schema_path = os.path.join(
                    os.path.split(__file__)[0], '../schemas/metadata.json')
                schema = json.load(open(schema_path))

                validator = DatetimeValidator()
                validator.validate(metadata, schema)
            except ValueError as e:
                logging.getLogger('billy').warning(
                    'metadata validation error: ' + str(e))

            with open(os.path.join(args.output_dir, 'metadata.json'),
                      'w') as f:
                json.dump(metadata, f, cls=JSONDateEncoder)

            run_record = []
            exec_record = {
                "run_record": run_record,
                "args": sys.argv,
                "state": abbrev
            }

            lex = None
            exc_traceback = None

            # start to run scrapers
            exec_start = dt.datetime.utcnow()

            # scraper order matters
            order = ('legislators', 'committees', 'votes', 'bills', 'events')
            try:
                for stype in order:
                    if stype in args.types:
                        run_record += _run_scraper(stype, args, metadata)
            except Exception as e:
                _traceback = _, _, exc_traceback = sys.exc_info()
                run_record += [{"exception": e, "type": stype}]
                lex = e

            exec_end = dt.datetime.utcnow()
            exec_record['started'] = exec_start
            exec_record['ended'] = exec_end
            scrape_data['scraped'] = exec_record
            scrape_data['state'] = abbrev

            for record in run_record:
                if "exception" in record:
                    ex = record['exception']
                    fb = traceback.format_exception(*_traceback)
                    trace = ""
                    for t in fb:
                        trace += t
                    record['exception'] = {
                        "type": ex.__class__.__name__,
                        "message": ex.message,
                        'traceback': trace
                    }
                    scrape_data['failure'] = True
            if lex:
                if 'import' in args.actions:
                    try:
                        db.billy_runs.save(scrape_data, safe=True)
                    except Exception:
                        raise lex, None, exc_traceback
                        # XXX: This should *NEVER* happen, but it has
                        # in the past, so we're going to catch any errors
                        # writing # to pymongo, and raise the original
                        # exception rather then let it look like Mongo's fault.
                        # Thanks for catching this, Thom.
                        #
                        # We lose the stack trace, but the Exception is the
                        # same in every other way.
                        #  -- paultag
                raise

        # imports
        if 'import' in args.actions:
            import_report = _do_imports(abbrev, args)
            scrape_data['imported'] = import_report
            # We're tying the run-logging into the import stage - since import
            # already writes to the DB, we might as well throw this in too.
            db.billy_runs.save(scrape_data, safe=True)

        # reports
        if 'report' in args.actions:
            _do_reports(abbrev, args)

    except ScrapeError as e:
        print 'Error:', e
        sys.exit(1)
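
Every example funnels its parsed arguments through settings.update(args), and Example #34 additionally feeds it the scraper module's own settings dict before the command-line values, with several scrapelib options left as None when unset. A sketch of a settings object consistent with that usage; this is an assumption about the interface, not billy's real implementation:

class Settings(object):
    """Hypothetical stand-in for billy's shared settings object."""

    def update(self, source):
        # accept either a plain dict of module defaults or an
        # argparse.Namespace of command-line values
        items = source if isinstance(source, dict) else vars(source)
        for key, value in items.items():
            # skip None so optional flags that were not passed on the
            # command line do not clobber earlier defaults
            if value is not None:
                setattr(self, key, value)

settings = Settings()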
Example #35
def main():

    parser = argparse.ArgumentParser(
        description='Scrape data for state, saving data to disk.',
        parents=[base_arg_parser],
    )

    parser.add_argument('state',
                        type=str,
                        help='state scraper module (eg. nc)')
    parser.add_argument('-s',
                        '--session',
                        action='append',
                        dest='sessions',
                        help='session(s) to scrape')
    parser.add_argument('-t',
                        '--term',
                        action='append',
                        dest='terms',
                        help='term(s) to scrape')
    parser.add_argument('--upper',
                        action='store_true',
                        dest='upper',
                        default=False,
                        help='scrape upper chamber')
    parser.add_argument('--lower',
                        action='store_true',
                        dest='lower',
                        default=False,
                        help='scrape lower chamber')
    parser.add_argument('--bills',
                        action='store_true',
                        dest='bills',
                        default=False,
                        help="scrape bill data")
    parser.add_argument('--legislators',
                        action='store_true',
                        dest='legislators',
                        default=False,
                        help="scrape legislator data")
    parser.add_argument('--committees',
                        action='store_true',
                        dest='committees',
                        default=False,
                        help="scrape committee data")
    parser.add_argument('--votes',
                        action='store_true',
                        dest='votes',
                        default=False,
                        help="scrape vote data")
    parser.add_argument('--events',
                        action='store_true',
                        dest='events',
                        default=False,
                        help='scrape event data')
    parser.add_argument('--alldata',
                        action='store_true',
                        dest='alldata',
                        default=False,
                        help="scrape all available types of data")
    parser.add_argument('--strict',
                        action='store_true',
                        dest='strict',
                        default=False,
                        help="fail immediately when"
                        "encountering validation warning")
    parser.add_argument('-n',
                        '--no_cache',
                        action='store_true',
                        dest='no_cache',
                        help="don't use web page cache")
    parser.add_argument('--fastmode',
                        help="scrape in fast mode",
                        action="store_true",
                        default=False)
    parser.add_argument('-r',
                        '--rpm',
                        action='store',
                        type=int,
                        dest='rpm',
                        default=60)

    args = parser.parse_args()

    settings.update(args)

    # set up search path
    sys.path.insert(
        0, os.path.join(os.path.dirname(__file__), '../../openstates'))

    # get metadata
    metadata = __import__(args.state, fromlist=['metadata']).metadata
    state = metadata['abbreviation']

    # configure logger
    if args.verbose == 0:
        verbosity = logging.WARNING
    elif args.verbose == 1:
        verbosity = logging.INFO
    else:
        verbosity = logging.DEBUG

    logging.basicConfig(
        level=verbosity,
        format="%(asctime)s %(name)s %(levelname)s " + state + " %(message)s",
        datefmt="%H:%M:%S",
    )

    # make output dir
    args.output_dir = os.path.join(settings.BILLY_DATA_DIR, state)
    try:
        os.makedirs(args.output_dir)
    except OSError as e:
        if e.errno != 17:
            raise e
Example #36
def main():
    parser = argparse.ArgumentParser(
        description='Import scraped data into database.',
        parents=[base_arg_parser],
    )

    parser.add_argument('abbreviation',
                        type=str,
                        help=('the short name of the data to import'))
    parser.add_argument('-r',
                        '--rpm',
                        type=int,
                        default=60,
                        help=('maximum number of documents to download '
                              'per minute'))
    parser.add_argument('--bills',
                        action='store_true',
                        help='scrape bill data')
    parser.add_argument('--legislators',
                        action='store_true',
                        help='scrape legislator data')
    parser.add_argument('--committees',
                        action='store_true',
                        help='scrape (separate) committee data')
    parser.add_argument('--events',
                        action='store_true',
                        help='scrape event data')
    parser.add_argument('--alldata',
                        action='store_true',
                        dest='alldata',
                        default=False,
                        help="import all available data")

    args = parser.parse_args()

    if not (args.bills or args.legislators or args.committees or args.events
            or args.alldata):
        raise Exception("Must specify at least one type: --bills, "
                        "--legislators, --committees, --events, "
                        "--alldata")

    settings.update(args)

    data_dir = settings.BILLY_DATA_DIR

    # configure logger
    configure_logging(args.verbose, args.abbreviation)

    # always import metadata
    import_metadata(args.abbreviation, data_dir)

    if args.legislators or args.alldata:
        import_legislators(args.abbreviation, data_dir)
    if args.bills or args.alldata:
        import_bills(args.abbreviation, data_dir)
    if args.committees or args.alldata:
        import_committees(args.abbreviation, data_dir)

    # events currently excluded from --alldata
    if args.events:
        import_events(args.abbreviation, data_dir)
Example #37
        print '%s,"%s"' % (n, category.encode('ascii', 'replace'))


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='apply subject categorization for bills',
        parents=[base_arg_parser],
        conflict_handler='resolve',
    )

    default_dir = os.path.join(os.path.dirname(__file__),
                               '../../manual_data/subjects')

    parser.add_argument('abbr',
                        type=str,
                        help='abbreviation for data to process')
    parser.add_argument('--all',
                        help='update all sessions',
                        action='store_true',
                        default=False)
    parser.add_argument('-d',
                        '--data_dir',
                        help='directory of subject csvs',
                        dest='data_dir',
                        default=default_dir)
    args = parser.parse_args()

    settings.update(args)

    categorize_subjects(args.abbr, args.data_dir, args.all)
Example #38
File: update.py Project: jmdupont/billy
def main(old_scrape_compat=False):
    try:
        parser = argparse.ArgumentParser(
            description='update billy data',
            parents=[base_arg_parser],
        )

        what = parser.add_argument_group(
            'what to scrape',
            'flags that help select what data to scrape')
        scrape = parser.add_argument_group(
            'scraper config',
            'settings for the scraper')

        parser.add_argument('module', type=str, help='scraper module (eg. nc)')
        parser.add_argument('--pdb', action='store_true', default=False,
                            help='invoke PDB when exception is raised')
        parser.add_argument('--ipdb', action='store_true', default=False,
                            help='invoke IPDB when exception is raised')
        parser.add_argument('--pudb', action='store_true', default=False,
                            help='invoke PUDB when exception is raised')
        what.add_argument('-s', '--session', action='append',
                          dest='sessions', default=[],
                          help='session(s) to scrape')
        what.add_argument('-t', '--term', action='append', dest='terms',
                          help='term(s) to scrape', default=[])

        for arg in ('upper', 'lower'):
            what.add_argument('--' + arg, action='append_const',
                              dest='chambers', const=arg)

        for arg in (
                'bills',
                'legislators',
                'committees',
                'votes',
                'events',
                'speeches'):
            what.add_argument(
                '--' + arg,
                action='append_const',
                dest='types',
                const=arg)

        for arg in ('scrape', 'import', 'report'):
            parser.add_argument('--' + arg, dest='actions',
                                action="append_const", const=arg,
                                help='only run %s step' % arg)
        # special modes for debugging
        scrape.add_argument('--nonstrict', action='store_false', dest='strict',
                            default=True, help="don't fail immediately when"
                            " encountering validation warning")

        scrape.add_argument('--fastmode', help="scrape in fast mode",
                            action="store_true", default=False)

        scrape.add_argument('--billid', help="scrape only a single bill",
                            action="store", default=False)

        # scrapelib overrides
        scrape.add_argument('-r', '--rpm', action='store', type=int,
                            dest='SCRAPELIB_RPM')
        scrape.add_argument('--timeout', action='store', type=int,
                            dest='SCRAPELIB_TIMEOUT')
        scrape.add_argument('--retries', type=int,
                            dest='SCRAPELIB_RETRY_ATTEMPTS')
        scrape.add_argument('--retry_wait', type=int,
                            dest='SCRAPELIB_RETRY_WAIT_SECONDS')

        args = parser.parse_args()

        if args.pdb or args.pudb or args.ipdb:
            _debugger = pdb
            if args.pudb:
                try:
                    import pudb
                    _debugger = pudb
                except ImportError:
                    pass
            if args.ipdb:
                try:
                    import ipdb
                    _debugger = ipdb
                except ImportError:
                    pass

            # turn on PDB-on-error mode
            # stolen from http://stackoverflow.com/questions/1237379/
            # if this causes problems in interactive mode check that page
            def _tb_info(_type, value, tb):
                traceback.print_exception(_type, value, tb)
                _debugger.pm()
            sys.excepthook = _tb_info

        # inject scraper paths so scraper module can be found
        for newpath in settings.SCRAPER_PATHS:
            sys.path.insert(0, newpath)

        # get metadata
        module = importlib.import_module(args.module)
        metadata = module.metadata
        module_settings = getattr(module, 'settings', {})
        abbrev = metadata['abbreviation']

        # load module settings, then command line settings
        settings.update(module_settings)
        settings.update(args)

        # make output dir
        args.output_dir = os.path.join(settings.BILLY_DATA_DIR, abbrev)

        # if terms aren't set, use latest
        if not args.terms:
            if args.sessions:
                for session in args.sessions:
                    args.terms.append(
                        term_for_session(metadata['abbreviation'], session,
                                         metadata))
                args.terms = list(set(args.terms or []))
            else:
                latest_term = metadata['terms'][-1]['name']
                args.terms = [latest_term]
        # only set sessions from terms if sessions weren't set
        elif not args.sessions:
            for term in metadata['terms']:
                if term['name'] in args.terms:
                    args.sessions.extend(term['sessions'])
            # dedup sessions
            args.sessions = list(set(args.sessions or []))

        if not args.sessions:
            args.sessions = [metadata['terms'][-1]['sessions'][-1]]

        # determine chambers
        if not args.chambers:
            args.chambers = ['upper', 'lower']

        if not args.actions:
            args.actions = ['scrape', 'import', 'report']

        if not args.types:
            args.types = ['bills', 'legislators', 'votes', 'committees',
                          'alldata']

            if 'events' in metadata['feature_flags']:
                args.types.append('events')

            if 'speeches' in metadata['feature_flags']:
                args.types.append('speeches')

        plan = """billy-update abbr=%s
    actions=%s
    types=%s
    sessions=%s
    terms=%s""" % (args.module, ','.join(args.actions), ','.join(args.types),
                   ','.join(args.sessions), ','.join(args.terms))
        _log.info(plan)
        scrape_data = {}

        if args.billid is False:
            _log.debug("No billid filter.")
        else:
            _log.debug("Search for billid: %s" % args.billid)

        if 'scrape' in args.actions:
            _clear_scraped_data(args.output_dir)

            # validate then write metadata
            if hasattr(module, 'session_list'):
                session_list = module.session_list()
            else:
                session_list = []
            check_sessions(metadata, session_list)

            _log.debug("Session List %s" % session_list)
            try:
                schema_path = os.path.join(
                    os.path.split(__file__)[0],
                    '../schemas/metadata.json')
                schema = json.load(open(schema_path))
                validator = DatetimeValidator()
                validator.validate(metadata, schema)
            except ValueError as e:
                _log.warning(
                    'metadata validation error: ' + str(e))

            with open(os.path.join(args.output_dir, 'metadata.json'),
                      'w') as f:
                json.dump(metadata, f, cls=JSONDateEncoder)

            run_record = []
            exec_record = {
                "run_record": run_record,
                "args": sys.argv,
                "state": abbrev
            }

            lex = None
            exc_traceback = None

            # start to run scrapers
            exec_start = dt.datetime.utcnow()

            # scraper order matters
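            # (presumably legislators and committees come first so that later
            # scrapers can resolve references to them)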
            if args.billid is False:
                order = (
                    'legislators',
                    'committees',
                    'votes',
                    'bills',
                    'events',
                    'speeches')
            else:
                _log.debug("going to process bills")
                order = ('bills',)  # only process the bills

            _traceback = None
            try:
                for stype in order:
                    _log.debug("consider to process %s" % stype)
                    if stype in args.types:
                        _log.debug("going to process %s" % stype)
                        scraper_results = _run_scraper(stype, args, metadata)

                        run_record += scraper_results
                    else:
                        _log.debug("skipping %s" % stype)

            except Exception as e:
                _traceback = _, _, exc_traceback = sys.exc_info()
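                # sys.exc_info() returns (type, value, traceback); the chained
                # assignment keeps the whole tuple for formatting below and the
                # traceback object by itself for a later re-raise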
                run_record += [{"exception": e, "type": stype}]
                lex = e

            exec_end = dt.datetime.utcnow()
            exec_record['started'] = exec_start
            exec_record['ended'] = exec_end
            scrape_data['scraped'] = exec_record
            scrape_data['abbr'] = abbrev

            for record in run_record:
                if "exception" in record:
                    ex = record['exception']
                    fb = traceback.format_exception(*_traceback)
                    trace = ""
                    for t in fb:
                        trace += t
                    record['exception'] = {
                        "type": ex.__class__.__name__,
                        "message": ex,
                        'traceback': trace
                    }
                    scrape_data['failure'] = True
            if lex:
                if 'import' in args.actions:
                    try:
                        _log.debug("scrape_data:")
                        if scrape_data['failure']:
                            _log.debug("Failed")
                            _log.debug(scrape_data)
                        else:
                            _log.debug("OK")
                            _log.debug(scrape_data)
                            db.billy_runs.save(scrape_data, safe=True)

                    except KeyError as e:
                        _log.debug("Caught exception1 :")
                        _log.debug(e)
                        exit(123)

                    except pymongo.errors.OperationFailure as e:
                        _log.debug("Caught exception3 :")
                        _log.debug(e)
                        exit(123)

                    except Exception as e:
                        _log.debug("Caught exception :")
                        _log.debug(e)
                        exit(123)
                        raise lex, None, exc_traceback
                        # XXX: This should *NEVER* happen, but it has
                        # in the past, so we're going to catch any errors
                        # writing to pymongo, and raise the original
                        # exception rather than let it look like Mongo's fault.
                        # Thanks for catching this, Thom.
                        #
                        # We lose the stack trace, but the Exception is the
                        # same in every other way.
                        #  -- paultag
                raise

        # imports
        if 'import' in args.actions:
            import_report = _do_imports(abbrev, args)
            scrape_data['imported'] = import_report
            # We're tying the run-logging into the import stage - since import
            # already writes to the DB, we might as well throw this in too.
            _log.debug(scrape_data)
            db.billy_runs.save(scrape_data, safe=True)

        # reports
        if 'report' in args.actions:
            _do_reports(abbrev, args)

        if 'session-list' in args.actions:
            if hasattr(module, 'session_list'):
                print("\n".join(module.session_list()))
            else:
                raise ScrapeError('session_list() is not defined')

    except ScrapeError as e:
        _log.debug("in update.py Scrape error")
        _log.debug("Scrape error :%s" % e)
        _log.critical('Error: %s' % e)
        sys.exit(1)

    except TypeError as e:
        _log.debug("Type error")
        _log.critical('TypeError: %s' % e)
        sys.exit(1)

    except NoData as e:
        _log.debug("No Data")
        _log.debug(e)
        _log.critical('No Data:')
        sys.exit(1)

    except NoDoc as e:
        _log.debug("No Doc")
        _log.critical('No Doc: %s' % e)
        sys.exit(1)

    except NoXpath as e:
        _log.debug("No XPath")
        _log.critical('No XPath: %s' % e)
        sys.exit(1)

    except Exception as e:
        _log.debug("Unknown error3")
        _log.debug(e)
        _log.critical('Unknown Error')
        sys.exit(1)
    else:
        print "Updating ids for {0}".format(abbr)

    print "Updating PVS legislator ids..."
    update_votesmart_legislators(meta)

    print "Updating TransparencyData ids..."
    update_transparencydata_legislators(meta, sunlight_key)


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description=('attempt to match legislators with ids in other '
                     'relevant APIs'),
        parents=[base_arg_parser],
    )

    parser.add_argument('abbrs', metavar='ABBR', type=str, nargs='+',
                        help='abbreviations for data to update')

    args = parser.parse_args()

    settings.update(args)

    votesmart.apikey = settings.VOTESMART_API_KEY

    for abbr in args.abbrs:
        update_missing_ids(abbr, settings.SUNLIGHT_SERVICES_KEY)
        time.sleep(30)
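
The --pdb/--ipdb/--pudb handling above installs a post-mortem hook through sys.excepthook. Below is a minimal standalone sketch of that idea; it is not billy code, the fail() function is invented for the demo, and pdb.post_mortem(tb) is called directly rather than the example's _debugger.pm():

import pdb
import sys
import traceback


def _tb_info(_type, value, tb):
    # print the traceback, then open an interactive post-mortem session
    traceback.print_exception(_type, value, tb)
    pdb.post_mortem(tb)


# from here on, any uncaught exception drops into the debugger
sys.excepthook = _tb_info


def fail():
    # invented function that raises, just to exercise the hook
    return {}['missing-key']


if __name__ == '__main__':
    fail()
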
Example #40
0
File: update.py Project: msabramo/billy
def main(old_scrape_compat=False):
    try:
        parser = argparse.ArgumentParser(
            description='update billy data',
            parents=[base_arg_parser],
        )

        what = parser.add_argument_group(
            'what to scrape',
            'flags that help select what data to scrape')
        scrape = parser.add_argument_group(
            'scraper config',
            'settings for the scraper')

        parser.add_argument('module', type=str, help='scraper module (eg. nc)')
        what.add_argument('-s', '--session', action='append',
                          dest='sessions', default=[],
                          help='session(s) to scrape')
        what.add_argument('-t', '--term', action='append', dest='terms',
                          help='term(s) to scrape', default=[])

        for arg in ('upper', 'lower'):
            what.add_argument('--' + arg, action='append_const',
                              dest='chambers', const=arg)
        for arg in ('bills', 'legislators', 'committees', 'votes', 'events'):
            what.add_argument('--' + arg, action='append_const', dest='types',
                              const=arg)
        for arg in ('scrape', 'import', 'report'):
            parser.add_argument('--' + arg, dest='actions',
                                action="append_const", const=arg,
                                help='only run %s step' % arg)

        # special modes for debugging
        scrape.add_argument('--nonstrict', action='store_false', dest='strict',
                            default=True, help="don't fail immediately when"
                            " encountering validation warning")
        scrape.add_argument('--fastmode', help="scrape in fast mode",
                            action="store_true", default=False)

        # scrapelib overrides
        scrape.add_argument('-r', '--rpm', action='store', type=int,
                            dest='SCRAPELIB_RPM')
        scrape.add_argument('--timeout', action='store', type=int,
                            dest='SCRAPELIB_TIMEOUT')
        scrape.add_argument('--retries', type=int,
                            dest='SCRAPELIB_RETRY_ATTEMPTS')
        scrape.add_argument('--retry_wait', type=int,
                            dest='SCRAPELIB_RETRY_WAIT_SECONDS')

        args = parser.parse_args()

        # inject scraper paths so scraper module can be found
        for newpath in settings.SCRAPER_PATHS:
            sys.path.insert(0, newpath)

        # get metadata
        module = __import__(args.module)
        metadata = module.metadata
        module_settings = getattr(module, 'settings', {})
        abbrev = metadata['abbreviation']

        # load state settings, then command line settings
        settings.update(module_settings)
        settings.update(args)

        configure_logging(args.module)

        # configure oyster
        if settings.ENABLE_OYSTER:
            from oyster.conf import settings as oyster_settings
            oyster_settings.DOCUMENT_CLASSES[args.module + ':billtext'] = module.document_class

        # make output dir
        args.output_dir = os.path.join(settings.BILLY_DATA_DIR, abbrev)
        _clear_scraped_data(args.output_dir)

        # if terms aren't set, use latest
        if not args.terms:
            if args.sessions:
                for session in args.sessions:
                    args.terms.append(
                        term_for_session(metadata['abbreviation'], session,
                                         metadata))
                args.terms = list(set(args.terms or []))
            else:
                latest_term = metadata['terms'][-1]['name']
                args.terms = [latest_term]
        # only set sessions from terms if sessions weren't set
        elif not args.sessions:
            for term in metadata['terms']:
                if term['name'] in args.terms:
                    args.sessions.extend(term['sessions'])
            # dedup sessions
            args.sessions = list(set(args.sessions or []))

        if not args.sessions:
            args.sessions = [metadata['terms'][-1]['sessions'][-1]]

        # determine chambers
        if not args.chambers:
            args.chambers = ['upper', 'lower']

        if not args.actions:
            if old_scrape_compat:
                args.actions = ['scrape']
            else:
                args.actions = ['scrape', 'import', 'report']

        if not args.types:
            args.types = ['bills', 'legislators', 'votes', 'committees',
                          'alldata']
            if 'events' in metadata['feature_flags']:
                args.types.append('events')

        plan = """billy-update abbr=%s
    actions=%s
    types=%s
    sessions=%s
    terms=%s""" % (args.module, ','.join(args.actions), ','.join(args.types),
                   ','.join(args.sessions), ','.join(args.terms))
        logging.getLogger('billy').info(plan)

        scrape_data = {}

        if 'scrape' in args.actions:
            # validate then write metadata
            if hasattr(module, 'session_list'):
                session_list = module.session_list()
            else:
                session_list = []
            check_sessions(metadata, session_list)

            try:
                schema_path = os.path.join(os.path.split(__file__)[0],
                                           '../schemas/metadata.json')
                schema = json.load(open(schema_path))

                validator = DatetimeValidator()
                validator.validate(metadata, schema)
            except ValueError as e:
                logging.getLogger('billy').warning(
                    'metadata validation error: ' + str(e))

            with open(os.path.join(args.output_dir, 'metadata.json'),
                      'w') as f:
                json.dump(metadata, f, cls=JSONDateEncoder)

            run_record = []
            exec_record = {
                "run_record": run_record,
                "args": sys.argv,
                "state": abbrev
            }

            lex = None
            exc_traceback = None

            # start to run scrapers
            exec_start = dt.datetime.utcnow()

            # scraper order matters
            order = ('legislators', 'committees', 'votes', 'bills', 'events')
            try:
                for stype in order:
                    if stype in args.types:
                        run_record += _run_scraper(stype, args, metadata)
            except Exception as e:
                _traceback = _, _, exc_traceback = sys.exc_info()
                run_record += [{"exception": e, "type": stype}]
                lex = e

            exec_end = dt.datetime.utcnow()
            exec_record['started'] = exec_start
            exec_record['ended'] = exec_end
            scrape_data['scraped'] = exec_record
            scrape_data['state'] = abbrev

            for record in run_record:
                if "exception" in record:
                    ex = record['exception']
                    fb = traceback.format_exception(*_traceback)
                    trace = ""
                    for t in fb:
                        trace += t
                    record['exception'] = {
                        "type": ex.__class__.__name__,
                        "message": ex.message,
                        'traceback': trace
                    }
                    scrape_data['failure'] = True
            if lex:
                if 'import' in args.actions:
                    try:
                        db.billy_runs.save(scrape_data, safe=True)
                    except Exception:
                        raise lex, None, exc_traceback
                        # XXX: This should *NEVER* happen, but it has
                        # in the past, so we're going to catch any errors
                        # writing to pymongo, and raise the original
                        # exception rather than let it look like Mongo's fault.
                        # Thanks for catching this, Thom.
                        #
                        # We lose the stack trace, but the Exception is the
                        # same in every other way.
                        #  -- paultag
                raise

        # imports
        if 'import' in args.actions:
            import_report = _do_imports(abbrev, args)
            scrape_data['imported'] = import_report
            # We're tying the run-logging into the import stage - since import
            # already writes to the DB, we might as well throw this in too.
            db.billy_runs.save(scrape_data, safe=True)

        # reports
        if 'report' in args.actions:
            _do_reports(abbrev, args)

    except ScrapeError as e:
        print 'Error:', e
        sys.exit(1)
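
Both examples assemble their command line the same way: argument groups for the help output, append_const flags that accumulate into a single list, and repeatable append options. Here is a minimal sketch of that pattern on its own; the flag names below are illustrative and not billy's full set:

import argparse

parser = argparse.ArgumentParser(description='demo of the flag pattern above')
what = parser.add_argument_group('what to scrape',
                                 'flags that help select what data to scrape')

# each flag appends its constant to the same list attribute
for arg in ('bills', 'legislators', 'committees', 'votes'):
    what.add_argument('--' + arg, action='append_const', dest='types',
                      const=arg)

# a repeatable option: -s 2011 -s 2012 gives sessions == ['2011', '2012']
what.add_argument('-s', '--session', action='append', dest='sessions',
                  default=[], help='session(s) to scrape')

args = parser.parse_args(['--bills', '--votes', '-s', '2011'])
assert args.types == ['bills', 'votes']
assert args.sessions == ['2011']

# with no type flag at all, args.types is None, so callers fall back to a
# default list, exactly as the updates above do
args = parser.parse_args([])
assert args.types is None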