def main():
    Flags.PARSER.add_argument(
        '--input_file', type=str, required=True, help='input gho csv file'
    )
    Flags.PARSER.add_argument(
        '--output_file', type=str, required=True, help='output file'
    )
    Flags.InitArgs()

    rows_written = 0
    filename = os.path.splitext(os.path.basename(Flags.ARGS.input_file))[0]
    print('Processing %s' % filename)
    # Open in text mode so csv reading and string writes work under Python 3.
    with gzip.open(Flags.ARGS.input_file, 'rt') as data_file, \
            gzip.open(Flags.ARGS.output_file, 'wt') as f_out:
        reader = csv.DictReader(data_file, skipinitialspace=True)
        for row in reader:
            res = process_row(row, filename)
            for data in res:
                if not data:
                    continue
                f_out.write(data)
                rows_written += 1
    print('Finished processing!')
    print('Rows written %d' % rows_written)
    return 0
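# All of the snippets here depend on a small `Flags` wrapper around argparse
# (Flags.PARSER, Flags.InitArgs, Flags.ARGS). The real implementation is not
# shown in the source; the sketch below is an illustrative assumption of the
# minimal shape such a module could take.
import argparse

class Flags:
    # Shared parser that each script adds its arguments to.
    PARSER = argparse.ArgumentParser()
    # Parsed namespace, populated once InitArgs() runs.
    ARGS = None

    @classmethod
    def InitArgs(cls):
        # Parse sys.argv once and expose the result as Flags.ARGS.
        cls.ARGS = cls.PARSER.parse_args()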
def main():
    Flags.PARSER.add_argument(
        '--port',
        '-p',
        type=int,
        required=False,
        default=5000,
        help='Port the server should use',
    )
    Flags.PARSER.add_argument(
        '--environment',
        '-e',
        required=False,
        type=str,
        default='',
        help='The Zenysis environment that the server should use. '
        'Can optionally be specified by setting the `ZEN_ENV` environment '
        'variable. The command-line argument will take precedence over '
        'the environment variable.',
        choices=list(VALID_MODULES),
    )
    Flags.InitArgs()

    # The command-line flag wins; fall back to the ZEN_ENV environment variable.
    environment = (
        Flags.ARGS.environment if Flags.ARGS.environment else os.getenv('ZEN_ENV')
    )
    if not environment:
        raise ValueError(
            'The Zenysis environment that the server should use is not set. '
            'It can optionally be specified by setting the `ZEN_ENV` environment '
            'variable or passing the environment flag.'
        )

    app = create_app(zenysis_environment=environment)
    app.run(host='0.0.0.0', port=Flags.ARGS.port)
def main():
    Flags.PARSER.add_argument(
        '--input_file',
        type=str,
        required=True,
        help='CSV containing values to validate',
    )
    Flags.PARSER.add_argument(
        '--datasource',
        type=str,
        default=DATASOURCE.name,
        help='Datasource to validate against',
    )
    Flags.PARSER.add_argument(
        '--output_file',
        type=str,
        required=True,
        help='Output CSV to write validation results',
    )
    Flags.InitArgs()

    input_file = Flags.ARGS.input_file
    dimensions = _extract_dimensions(input_file)
    LOG.info('Starting validation over dimensions: %s', dimensions)

    validator = PivotedCSVValidator(Flags.ARGS.datasource, dimensions)
    validator.parse_and_run(input_file, Flags.ARGS.output_file)
    return validator.passed_validation
def main():
    Flags.PARSER.add_argument(
        '--input_file', type=str, required=True, help='input file'
    )
    Flags.PARSER.add_argument(
        '--output_file', type=str, required=True, help='output file'
    )
    Flags.InitArgs()

    filename = os.path.splitext(os.path.basename(Flags.ARGS.input_file))[0]
    print('Processing %s' % filename)
    df = load_df(Flags.ARGS.input_file)
    rows_written = 0
    # Open in text mode so string rows can be written directly under Python 3.
    with gzip.open(Flags.ARGS.output_file, 'wt') as f_out:
        for _, row in df.iterrows():
            res = process_row(row)
            f_out.write(res)
            rows_written += 1
    print('Finished processing!')
    print('Rows written %d' % rows_written)
    return 0
def main():
    try:
        Cleaner.Init(Flags.PARSER)
        Flags.InitArgs()
        return Cleaner.Run()
    except KeyboardInterrupt:
        TermColor.Warning('KeyboardInterrupt')
        return 1
def main():
    try:
        DepGraph.Init(Flags.PARSER)
        Flags.InitArgs()
        return DepGraph.Run()
    except KeyboardInterrupt:
        TermColor.Warning('KeyboardInterrupt')
        return 1
def setup_flags():
    Flags.PARSER.add_argument(
        '--input', type=str, required=True, help='Path to input json lz4'
    )
    Flags.PARSER.add_argument(
        '--output', type=str, required=True, help='Path to output json gz'
    )
    Flags.InitArgs()
def _Init(self):
    subparsers = Flags.PARSER.add_subparsers()
    for cmd in self.SUPPORTED_CMDS:
        parser = subparsers.add_parser(
            cmd,
            conflict_handler='resolve',
            formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        )
        handler = self._GetHandler(cmd, 'init')
        if handler:
            handler(parser)
        parser.set_defaults(func=self._GetHandler(cmd, 'run'))
    Flags.InitArgs()
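# After Flags.InitArgs() parses the command line, argparse stores the selected
# subcommand's 'run' handler (bound via parser.set_defaults above) on the
# parsed namespace. A hedged sketch of the dispatch step, assuming a Run
# method and a no-argument handler signature (neither shown in the original):
def Run(self):
    # Invoke the 'run' handler of whichever subcommand was selected.
    return Flags.ARGS.func()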
def main():
    Flags.PARSER.add_argument(
        '--task_id', type=str, required=True, help='The indexing task ID to lookup'
    )
    Flags.PARSER.add_argument(
        '--block_until_completed',
        action='store_true',
        default=False,
        help='Poll druid and exit this script only when '
        'a task is no longer running',
    )
    Flags.InitArgs()

    block_until_completed = Flags.ARGS.block_until_completed
    task_id = Flags.ARGS.task_id
    LOG.info('Fetching status for task ID: %s', task_id)

    elapsed_time = 0
    connection_failure_count = 0
    while True:
        if elapsed_time >= POLL_TIMEOUT:
            status = TaskStatus.POLL_TIMEOUT
            break
        if connection_failure_count >= MAX_CONNECTION_ERRORS:
            status = TaskStatus.MAX_CONNECTION_ERRORS
            break

        status = fetch_status(task_id)
        if status == TaskStatus.CONNECTION_ERROR:
            connection_failure_count += 1
        elif status != TaskStatus.RUNNING or not block_until_completed:
            break

        # Poll until the task is no longer in the RUNNING state
        elapsed_time += POLL_INTERVAL
        time.sleep(POLL_INTERVAL)

        # Report our status every minute of elapsed polling time
        if (elapsed_time % 60) == 0:
            LOG.info(
                'Task is still running. Elapsed time (minutes): %s',
                elapsed_time // 60,
            )

    # Return exit code stored for this task status
    LOG.info('Task status: %s', status.name)
    return status.value
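# A hedged sketch of what the TaskStatus enum referenced above might look
# like: each member's value doubles as the script's process exit code (see
# `return status.value`). Only the member names appear in the original
# snippet; the specific values here are assumptions.
import enum

class TaskStatus(enum.Enum):
    SUCCESS = 0                # task finished successfully
    RUNNING = 1                # task is still in progress
    CONNECTION_ERROR = 2       # a single status request failed
    POLL_TIMEOUT = 3           # gave up after POLL_TIMEOUT seconds
    MAX_CONNECTION_ERRORS = 4  # too many consecutive request failures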
def _Init(self):
    subparsers = Flags.PARSER.add_subparsers()
    for cmd in self.SUPPORTED_CMDS:
        parser = subparsers.add_parser(
            cmd,
            conflict_handler='resolve',
            formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        )
        handler = self._GetHandler(cmd, 'init')
        if handler:
            handler(parser)
        parser.set_defaults(func=self._GetHandler(cmd, 'run'))
    Flags.InitArgs()

    # Get the pipeline config instance after all args have been set up.
    pc.PipelineConfig.Instance()
def main():
    Flags.PARSER.add_argument(
        '--woreda_mapped_path',
        type=str,
        required=True,
        help='path to woreda_mapped.csv',
    )
    Flags.PARSER.add_argument(
        '--weather_data_dir',
        type=str,
        required=True,
        help='path to folder with weather data csv',
    )
    Flags.PARSER.add_argument(
        '--output_dir', type=str, required=True, help='path to output dir'
    )
    Flags.InitArgs()

    write_weather_to_gzjson(
        Flags.ARGS.woreda_mapped_path,
        Flags.ARGS.weather_data_dir,
        Flags.ARGS.output_dir,
    )
    return 0
def main():
    Flags.PARSER.add_argument(
        '-d',
        '--sql_connection_string',
        type=str,
        required=False,
        help='The SQL connection string to use to connect to the SQL '
        'database. Can also be specified via the \'DATABASE_URL\' '
        'environment variable. The inline parameter takes priority '
        'over the environment variable.',
    )
    Flags.PARSER.add_argument(
        '-u',
        '--username',
        type=str,
        required=False,
        help='The username of the user. MUST be a Zenysis e-mail address.',
    )
    Flags.PARSER.add_argument(
        '-f', '--first_name', type=str, required=False, help='The user\'s first name.'
    )
    Flags.PARSER.add_argument(
        '-l', '--last_name', type=str, required=False, help='The user\'s last name.'
    )
    Flags.PARSER.add_argument(
        '-p',
        '--password',
        type=str,
        required=False,
        help='The user\'s password. If none is specified, this will be '
        'auto-generated.',
    )
    Flags.PARSER.add_argument(
        '-s',
        '--status',
        type=str,
        action='store',
        required=False,
        choices=[e.name for e in UserStatusEnum],
        default=UserStatusEnum.ACTIVE.name,
        help=(
            'The status of the user account. '
            '1. ACTIVE - The user will be able to login immediately. '
            '2. INACTIVE - The user will not be able to login unless an '
            'Administrator logs in and marks the user as active. '
            '3. PENDING - The user will not be able to login unless an '
            'Administrator logs in and sends the user an invite email.'
        ),
    )
    Flags.PARSER.add_argument(
        '-a',
        '--site_admin',
        action='store_true',
        required=False,
        default=False,
        help='If specified, make the user an admin.',
    )
    Flags.PARSER.add_argument(
        '-o',
        '--overwrite',
        action='store_true',
        required=False,
        default=False,
        help='Overwrite the user if the specified username already exists.',
    )
    Flags.PARSER.add_argument(
        '-A',
        '--automation_user',
        action='store_true',
        required=False,
        default=False,
        help='Make a new automation user.',
    )
    Flags.InitArgs()

    sql_connection_string = Flags.ARGS.sql_connection_string
    if not sql_connection_string:
        instance_configuration = load_instance_configuration_from_file()
        with CredentialProvider(instance_configuration) as credential_provider:
            sql_connection_string = credential_provider.get(
                'SQLALCHEMY_DATABASE_URI'
            )

    username = Flags.ARGS.username
    first_name = Flags.ARGS.first_name or None
    last_name = Flags.ARGS.last_name or None
    plaintext_password = Flags.ARGS.password
    is_site_admin = Flags.ARGS.site_admin
    # pylint: disable=E1136
    # The choices defined in Flags match exactly those defined in the Enum,
    # so there will not be a KeyError.
    status = UserStatusEnum[Flags.ARGS.status]
    overwrite_user = Flags.ARGS.overwrite
    automation_user = Flags.ARGS.automation_user

    if automation_user:
        username = AUTOMATION_USERNAME
        first_name = AUTOMATION_FIRST_NAME
        last_name = AUTOMATION_LAST_NAME
        _, plaintext_password = get_credentials()
        is_site_admin = True

    if not username:
        LOG.error(
            'You must provide a username if you are not creating an automation user.'
        )
        return 5

    if not overwrite_user and (not first_name or not last_name):
        LOG.error(
            'You must provide a first and last name if you are creating a new user.'
        )
        return 2

    username = username.strip()
    first_name = first_name.strip() if first_name else None
    last_name = last_name.strip() if last_name else None

    if not is_email_address(username):
        LOG.error(
            'Username \'%s\' is not valid. It must be an e-mail address.', username
        )
        return 3

    Session = sessionmaker()
    engine = create_engine(sql_connection_string)
    Session.configure(bind=engine)
    session = Session()

    with Transaction(should_commit=None, get_session=lambda: session) as transaction:
        (new_user, plaintext_password) = create_user(
            transaction,
            username,
            first_name,
            last_name,
            plaintext_password,
            is_site_admin,
            overwrite_user,
            status,
        )
        LOG.info(
            'Successfully created/updated User \'%s\' with status \'%s\' and password \'%s\'.',
            get_user_string(new_user),
            status.name,
            plaintext_password,
        )
    return 0
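# Example invocation of the user-creation script above (the script name and
# all values here are hypothetical, shown only to illustrate the flags):
#   python create_user.py -u jane.doe@zenysis.com -f Jane -l Doe -s PENDING -a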
def loop():
    # Monitor loop: run checks forever, pausing between iterations.
    while True:
        monitor_group.check()
        sleep(Flags.ARGS.sleep_interval)


# this must be called after method definitions
server.configure()

Flags.PARSER.add_argument(
    '--sleep_interval',
    type=int,
    default=5,
    help='sleep interval between checks in seconds',
)
Flags.PARSER.add_argument(
    '--alert_email',
    type=str,
    default='*****@*****.**',
    help='email address to send alert messages to',
)

if __name__ == '__main__':
    Flags.InitArgs()

    # Start the monitor loop. Make it a daemon thread so shutdown works
    # correctly.
    t = threading.Thread(target=loop)
    t.daemon = True
    t.start()

    # Need to remove all args before starting the server because gunicorn
    # will freak out if it sees unknown arguments.
    sys.argv = sys.argv[:1]
    server.run()
def main():
    # Required flags
    Flags.PARSER.add_argument(
        '--data_files',
        type=str,
        required=True,
        nargs='+',
        help='Path to JSON data files to be ingested',
    )

    # Optional flags that override default values
    Flags.PARSER.add_argument(
        '--datasource_name',
        type=str,
        default='',
        help='Optional datasource name. If unspecified, one will be generated.',
    )
    Flags.PARSER.add_argument(
        '--task_template_file',
        type=str,
        default='',
        help='Optional indexing template to use',
    )
    Flags.PARSER.add_argument(
        '--metrics_spec_file', type=str, default='', help='Optional metrics spec to use'
    )
    Flags.PARSER.add_argument(
        '--tuning_config_file',
        type=str,
        default='',
        help='Optional task tuning config to use',
    )
    Flags.PARSER.add_argument(
        '--task_hash_dir',
        type=str,
        default=DEFAULT_TASK_HASH_DIR,
        help='Directory where indexing task hashes are stored',
    )
    Flags.PARSER.add_argument(
        '--output_task_id_file',
        type=str,
        default='',
        help='File to store the indexing task ID in',
    )
    Flags.PARSER.add_argument(
        '--force',
        action='store_true',
        default=False,
        help='Force the datasource to be created even if a datasource '
        'already exists with the same data',
    )
    Flags.PARSER.add_argument(
        '--dry_run',
        action='store_true',
        default=False,
        help='Issue a "noop" indexing task and skip building a new datasource',
    )
    Flags.PARSER.add_argument(
        '--min_data_date',
        type=str,
        default=DEFAULT_MIN_DATA_DATE_STR,
        help='Optional earliest data date string: YYYY-MM-DD',
    )
    Flags.PARSER.add_argument(
        '--max_data_date',
        type=str,
        default=DEFAULT_MAX_DATA_DATE_STR,
        help='Optional latest data date string: YYYY-MM-DD',
    )
    Flags.InitArgs()

    # Create a deterministic version number so that we can differentiate the
    # current live datasources even if they have the same datasource name.
    # NOTE(stephen): For some weird reason, this string version value has to
    # resolve to a value less than the task "lock" version, which is the
    # formatted timestamp that the druid indexing task actually began. This is
    # dumb. https://github.com/druid-io/druid/pull/3559
    version = TODAY.strftime('%Y-%m-%d.%H%M%S')
    indexing_task = build_indexing_task(version)
    indexing_task.print_overview()
    print('')

    (cur_datasource, cur_version) = get_current_datasource_for_site()
    if (
        not Flags.ARGS.force
        and cur_datasource
        and cur_version
        and not task_contains_new_data(indexing_task, cur_datasource, cur_version)
    ):
        print(
            '##### Skipping indexing since existing datasource '
            'contains the same data specified in this task. #####'
        )
        print('##### Current datasource: %s #####' % cur_datasource)
        print('##### Current version: %s #####' % cur_version)
        # TODO(stephen): Switch to the log library so that we can specify
        # a loglevel as a flag. Then I won't have to comment out potentially
        # useful debug statements.
        # print('Current task hash:')
        # print(indexing_task.get_task_hash())
        return 0

    dry_run = Flags.ARGS.dry_run
    task_id = run_task(indexing_task, dry_run)
    if not task_id:
        return 1

    if not dry_run:
        store_task_hash(indexing_task)

    output_task_id_file = Flags.ARGS.output_task_id_file
    if output_task_id_file:
        FileUtils.CreateFileWithData(output_task_id_file, task_id)

    print('Successfully started indexing task. Task ID: %s' % task_id)
    return 0
def setup_flags():
    Flags.PARSER.add_argument(
        '--dimensions',
        type=str,
        nargs='*',
        required=False,
        help='List of dimension columns, comma-separated',
    )
    Flags.PARSER.add_argument(
        '--rename_cols',
        type=str,
        nargs='*',
        required=False,
        help='Optional mappings for renaming CSV columns, formatted as '
        '"OriginalName:NewName". For example: region_name:RegionName',
    )
    Flags.PARSER.add_argument(
        '--join_cols',
        type=str,
        nargs='*',
        required=False,
        help='Optional mappings for joining CSV columns, formatted as '
        '"OriginalName1+OriginalName2:NewName". '
        'Customize the concatenation by specifying --join_str. '
        'For example: region_name+district_name:GeoName',
    )
    Flags.PARSER.add_argument(
        '--join_str',
        type=str,
        required=False,
        default=' - ',
        help='String that is used to concatenate join_cols',
    )
    Flags.PARSER.add_argument(
        '--fields',
        type=str,
        nargs='*',
        required=False,
        help='List of field columns to unpivot, comma-separated. If not '
        'specified, the data is assumed to be unpivoted with "field" and '
        '"val" columns.',
    )
    Flags.PARSER.add_argument('--date', type=str, required=True, help='The date column')
    Flags.PARSER.add_argument('--prefix', type=str, required=True, help='Field ID prefix')
    Flags.PARSER.add_argument('--sourcename', type=str, required=True, help='Name of source')
    Flags.PARSER.add_argument(
        '--disable_rollup',
        action='store_true',
        default=False,
        help='If set, do not combine values of rows that represent the same '
        'dimensions + date',
    )
    Flags.PARSER.add_argument(
        '--policy',
        type=str,
        required=False,
        default='ABORT',
        help='Policy for handling data anomalies',
    )
    Flags.PARSER.add_argument(
        '--tracer_field',
        type=str,
        required=False,
        default=None,
        help='Field ID for facility count indicators.',
    )
    Flags.PARSER.add_argument(
        '--flatten_string_categories',
        action='store_true',
        default=False,
        help='If true, append string values to field names and set the value '
        'to 1. In other words, convert "FieldName: yes" values to '
        '"FieldName - yes: 1"',
    )
    Flags.PARSER.add_argument(
        '--enable_field_wildcards',
        action='store_true',
        default=False,
        help='If true, unpivot all columns that begin with *field_ rather '
        'than specifying field names individually. Overrides --fields param',
    )
    Flags.PARSER.add_argument(
        '--input',
        type=str,
        required=True,
        help='Path to input CSV. File type can be: uncompressed (.csv), '
        'gzip compressed (.gz), or lz4 compressed (.lz4)',
    )
    Flags.PARSER.add_argument(
        '--output_rows', type=str, required=True, help='Path to output rows json lz4'
    )
    Flags.PARSER.add_argument(
        '--output_locations', type=str, required=True, help='Path to output locations'
    )
    Flags.PARSER.add_argument(
        '--output_fields', type=str, required=True, help='Path to output fields'
    )
    Flags.PARSER.add_argument(
        '--output_indicators',
        type=str,
        required=False,
        help='Path to output JSON indicator groups',
    )
    Flags.InitArgs()
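# A minimal sketch (not from the original source) of how the
# "OriginalName:NewName" and "OriginalName1+OriginalName2:NewName" mapping
# formats documented in --rename_cols and --join_cols above could be parsed.
# Both helper names are hypothetical.
def parse_rename_cols(rename_cols):
    # 'region_name:RegionName' -> {'region_name': 'RegionName'}
    return dict(mapping.split(':', 1) for mapping in rename_cols or [])

def parse_join_cols(join_cols):
    # 'region_name+district_name:GeoName' ->
    #     {'GeoName': ['region_name', 'district_name']}
    output = {}
    for mapping in join_cols or []:
        source_cols, new_name = mapping.split(':', 1)
        output[new_name] = source_cols.split('+')
    return output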