def main():
    Flags.PARSER.add_argument('--input_file',
                              type=str,
                              required=True,
                              help='input gho csv file')
    Flags.PARSER.add_argument('--output_file',
                              type=str,
                              required=True,
                              help='output file')

    Flags.InitArgs()

    rows_written = 0
    filename = os.path.splitext(os.path.basename(Flags.ARGS.input_file))[0]
    print('Processing %s' % filename)

    with gzip.open(Flags.ARGS.input_file, 'rt') as data_file, \
            gzip.open(Flags.ARGS.output_file, 'wt') as f_out:
        reader = csv.DictReader(data_file, skipinitialspace=True)
        for row in reader:
            res = process_row(row, filename)
            for data in res:
                if not data:
                    continue
                f_out.write(data)
                rows_written += 1

        print('Finished processing!')
        print('Rows written %d' % rows_written)
        return 0
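
Every example here relies on a shared Flags module that wraps argparse but whose definition is not shown. A minimal sketch consistent with the usage in these examples (the real implementation may differ):

import argparse


class Flags:
    # Shared parser that scripts attach their arguments to.
    PARSER = argparse.ArgumentParser()
    # Parsed argument namespace, populated by InitArgs().
    ARGS = None

    @classmethod
    def InitArgs(cls):
        cls.ARGS = cls.PARSER.parse_args()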
Example #2
def main():
    Flags.PARSER.add_argument(
        '--port',
        '-p',
        type=int,
        required=False,
        default=5000,
        help='Port the server should use',
    )
    Flags.PARSER.add_argument(
        '--environment',
        '-e',
        required=False,
        type=str,
        default='',
        help='The Zenysis environment that the server should use. '
        'Can optionally be specified by setting the `ZEN_ENV` environment '
        'variable. The command-line argument will take precedence over '
        'the environment variable.',
        choices=list(VALID_MODULES),
    )
    Flags.InitArgs()

    environment = (
        Flags.ARGS.environment if Flags.ARGS.environment else os.getenv('ZEN_ENV')
    )
    if not environment:
        raise ValueError(
            'The Zenysis environment that the server should use is not set. '
            'It can optionally be specified by setting the `ZEN_ENV` environment '
            'variable or passing the environment flag.'
        )

    app = create_app(zenysis_environment=environment)
    app.run(host='0.0.0.0', port=Flags.ARGS.port)
Example #3
def main():
    Flags.PARSER.add_argument(
        '--input_file',
        type=str,
        required=True,
        help='CSV containing values to validate',
    )
    Flags.PARSER.add_argument(
        '--datasource',
        type=str,
        default=DATASOURCE.name,
        help='Datasource to validate against',
    )
    Flags.PARSER.add_argument(
        '--output_file',
        type=str,
        required=True,
        help='Output CSV to write validation results',
    )
    Flags.InitArgs()
    input_file = Flags.ARGS.input_file
    dimensions = _extract_dimensions(input_file)

    LOG.info('Starting validation over dimensions: %s', dimensions)
    validator = PivotedCSVValidator(Flags.ARGS.datasource, dimensions)
    validator.parse_and_run(input_file, Flags.ARGS.output_file)
    return validator.passed_validation
Example #4
def main():
    Flags.PARSER.add_argument('--input_file',
                              type=str,
                              required=True,
                              help='input file')
    Flags.PARSER.add_argument('--output_file',
                              type=str,
                              required=True,
                              help='output file')

    Flags.InitArgs()

    filename = os.path.splitext(os.path.basename(Flags.ARGS.input_file))[0]
    print('Processing %s' % filename)

    df = load_df(Flags.ARGS.input_file)

    rows_written = 0
    with gzip.open(Flags.ARGS.output_file, 'wt') as f_out:
        for _, row in df.iterrows():
            res = process_row(row)
            f_out.write(res)
            rows_written += 1

    print('Finished processing!')
    print('Rows written %d' % rows_written)
    return 0
Example #5
def main():
    try:
        Cleaner.Init(Flags.PARSER)
        Flags.InitArgs()
        return Cleaner.Run()
    except KeyboardInterrupt:
        TermColor.Warning('KeyboardInterrupt')
        return 1
Example #6
def main():
    try:
        DepGraph.Init(Flags.PARSER)
        Flags.InitArgs()
        return DepGraph.Run()
    except KeyboardInterrupt:
        TermColor.Warning('KeyboardInterrupt')
        return 1
Example #7
def setup_flags():
    Flags.PARSER.add_argument('--input',
                              type=str,
                              required=True,
                              help='Path to input json lz4')
    Flags.PARSER.add_argument('--output',
                              type=str,
                              required=True,
                              help='Path to output json gz')

    Flags.InitArgs()
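
The main() that pairs with this setup_flags() is not included in the example. A hypothetical sketch that streams the lz4 input into a gzip output, assuming the third-party lz4 package (lz4.frame) is available:

import gzip

import lz4.frame


def main():
    setup_flags()
    # Stream-decompress the lz4 input and recompress it as gzip, one
    # 1 MiB chunk at a time to keep memory bounded.
    with lz4.frame.open(Flags.ARGS.input, 'rb') as f_in, \
            gzip.open(Flags.ARGS.output, 'wb') as f_out:
        for chunk in iter(lambda: f_in.read(1 << 20), b''):
            f_out.write(chunk)
    return 0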
Example #8
    def _Init(self):
        subparsers = Flags.PARSER.add_subparsers()
        for cmd in self.SUPPORTED_CMDS:
            parser = subparsers.add_parser(
                cmd,
                conflict_handler='resolve',
                formatter_class=argparse.ArgumentDefaultsHelpFormatter)
            handler = self._GetHandler(cmd, 'init')
            if handler:
                handler(parser)
            parser.set_defaults(func=self._GetHandler(cmd, 'run'))

        Flags.InitArgs()
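
_Init stores each command's run handler via set_defaults(func=...), but the matching Run method is not shown. A plausible dispatch, assuming each handler returns an exit code:

    def Run(self):
        # Invoke the run handler selected by the parsed subcommand.
        return Flags.ARGS.func()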
Example #9
def main():
    Flags.PARSER.add_argument('--task_id',
                              type=str,
                              required=True,
                              help='The indexing task ID to lookup')
    Flags.PARSER.add_argument(
        '--block_until_completed',
        action='store_true',
        default=False,
        help='Poll druid and exit this script only when '
        'a task is no longer running',
    )
    Flags.InitArgs()

    block_until_completed = Flags.ARGS.block_until_completed
    task_id = Flags.ARGS.task_id
    LOG.info('Fetching status for task ID: %s', task_id)

    elapsed_time = 0
    connection_failure_count = 0
    while True:
        if elapsed_time >= POLL_TIMEOUT:
            status = TaskStatus.POLL_TIMEOUT
            break

        if connection_failure_count >= MAX_CONNECTION_ERRORS:
            status = TaskStatus.MAX_CONNECTION_ERRORS
            break

        status = fetch_status(task_id)

        if status == TaskStatus.CONNECTION_ERROR:
            connection_failure_count += 1
        elif status != TaskStatus.RUNNING or not block_until_completed:
            break

        # Poll until the task is no longer in the RUNNING state
        elapsed_time += POLL_INTERVAL
        time.sleep(POLL_INTERVAL)

        # Report our status every 60 seconds of elapsed time
        if (elapsed_time % 60) == 0:
            LOG.info(
                'Task is still running. Elapsed time (minutes): %s',
                elapsed_time // 60,
            )

    # Return exit code stored for this task status
    LOG.info('Task status: %s', status.name)
    return status.value
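
TaskStatus is defined elsewhere; the loop above only requires that each member's .name be printable and that .value double as the script's exit code. A hypothetical definition consistent with that usage (SUCCESS and FAILED are guesses; the other members are referenced above):

from enum import Enum


class TaskStatus(Enum):
    # .value doubles as the script's exit code.
    SUCCESS = 0
    FAILED = 1
    RUNNING = 2
    CONNECTION_ERROR = 3
    POLL_TIMEOUT = 4
    MAX_CONNECTION_ERRORS = 5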
Example #10
    def _Init(self):
        subparsers = Flags.PARSER.add_subparsers()
        for cmd in self.SUPPORTED_CMDS:
            parser = subparsers.add_parser(
                cmd,
                conflict_handler='resolve',
                formatter_class=argparse.ArgumentDefaultsHelpFormatter)
            handler = self._GetHandler(cmd, 'init')
            if handler:
                handler(parser)
            parser.set_defaults(func=self._GetHandler(cmd, 'run'))

        Flags.InitArgs()

        # Get the pipeline config instance after all args have been set up.
        pc.PipelineConfig.Instance()
Example #11
def main():
    Flags.PARSER.add_argument('--woreda_mapped_path',
                              type=str,
                              required=True,
                              help='path to woreda_mapped.csv')
    Flags.PARSER.add_argument('--weather_data_dir',
                              type=str,
                              required=True,
                              help='path to folder with weather data CSVs')
    Flags.PARSER.add_argument('--output_dir',
                              type=str,
                              required=True,
                              help='path to output dir')
    Flags.InitArgs()

    write_weather_to_gzjson(Flags.ARGS.woreda_mapped_path,
                            Flags.ARGS.weather_data_dir,
                            Flags.ARGS.output_dir)
    return 0
Example #12
def main():
    Flags.PARSER.add_argument(
        '-d',
        '--sql_connection_string',
        type=str,
        required=False,
        help='The SQL Connection String to use to connect to the SQL '
        'Database. Can also be specified via the \'DATABASE_URL\' '
        'environment variable. The inline parameter takes priority '
        'over the environment variable.',
    )
    Flags.PARSER.add_argument(
        '-u',
        '--username',
        type=str,
        required=False,
        help='The username of the user. MUST be a Zenysis e-mail address.',
    )
    Flags.PARSER.add_argument('-f',
                              '--first_name',
                              type=str,
                              required=False,
                              help='The user\'s first name.')
    Flags.PARSER.add_argument('-l',
                              '--last_name',
                              type=str,
                              required=False,
                              help='The user\'s last name.')
    Flags.PARSER.add_argument(
        '-p',
        '--password',
        type=str,
        required=False,
        help='The user\'s password. If none is specified, this will be '
        'auto-generated.',
    )
    Flags.PARSER.add_argument(
        '-s',
        '--status',
        type=str,
        action='store',
        required=False,
        choices=[e.name for e in UserStatusEnum],
        default=UserStatusEnum.ACTIVE.name,
        help=('The status to assign to the new user. '
              '1. ACTIVE - The user will be able to login immediately. '
              '2. INACTIVE - The user will not be able to login unless an '
              'Administrator logs in and marks the user as active. '
              '3. PENDING - The user will not be able to login unless an '
              'Administrator logs in and sends the user an invite email.'),
    )
    Flags.PARSER.add_argument(
        '-a',
        '--site_admin',
        action='store_true',
        required=False,
        default=False,
        help='If specified, make user an admin.',
    )
    Flags.PARSER.add_argument(
        '-o',
        '--overwrite',
        action='store_true',
        required=False,
        default=False,
        help='Overwrite the user if the specified username already exists.',
    )
    Flags.PARSER.add_argument(
        '-A',
        '--automation_user',
        action='store_true',
        required=False,
        default=False,
        help='Make a new automation user.',
    )
    Flags.InitArgs()
    sql_connection_string = Flags.ARGS.sql_connection_string
    if not sql_connection_string:
        instance_configuration = load_instance_configuration_from_file()
        with CredentialProvider(instance_configuration) as credential_provider:
            sql_connection_string = credential_provider.get(
                'SQLALCHEMY_DATABASE_URI')
    username = Flags.ARGS.username
    first_name = Flags.ARGS.first_name or None
    last_name = Flags.ARGS.last_name or None
    plaintext_password = Flags.ARGS.password
    is_site_admin = Flags.ARGS.site_admin
    # pylint: disable=E1136
    # The choices defined in Flags match exactly those defined in the Enum,
    # so there will not be a KeyError.
    status = UserStatusEnum[Flags.ARGS.status]
    overwrite_user = Flags.ARGS.overwrite
    automation_user = Flags.ARGS.automation_user

    if automation_user:
        username = AUTOMATION_USERNAME
        first_name = AUTOMATION_FIRST_NAME
        last_name = AUTOMATION_LAST_NAME
        _, plaintext_password = get_credentials()
        is_site_admin = True

    if not username:
        LOG.error(
            'You must provide a username if you are not creating an automation user.'
        )
        return 5

    if not overwrite_user and (not first_name or not last_name):
        LOG.error(
            'You must provide a first and last name if you are creating a new user.'
        )
        return 2

    username = username.strip()
    first_name = first_name.strip() if first_name else None
    last_name = last_name.strip() if last_name else None

    if not is_email_address(username):
        LOG.error(
            'Username \'%s\' is not valid. It must be an e-mail address.',
            username)
        return 3

    Session = sessionmaker()
    engine = create_engine(sql_connection_string)
    Session.configure(bind=engine)
    session = Session()

    with Transaction(should_commit=None,
                     get_session=lambda: session) as transaction:
        (new_user, plaintext_password) = create_user(
            transaction,
            username,
            first_name,
            last_name,
            plaintext_password,
            is_site_admin,
            overwrite_user,
            status,
        )
        LOG.info(
            'Successfully created/updated User \'%s\' with status \'%s\' and password \'%s\'.',
            get_user_string(new_user),
            status.name,
            plaintext_password,
        )

    return 0
Example #13
def loop():
    # Repeatedly check the monitor group, pausing between checks.
    while True:
        monitor_group.check()
        sleep(Flags.ARGS.sleep_interval)


# this must be called after method definitions
server.configure()

Flags.PARSER.add_argument('--sleep_interval',
                          type=int,
                          default=5,
                          help='sleep interval between checks in seconds')

Flags.PARSER.add_argument('--alert_email',
                          type=str,
                          default='*****@*****.**',
                          help='email address to send alert messages to')

if __name__ == '__main__':
    Flags.InitArgs()

    # start the monitor loop
    # make it a daemon thread to make shutdown work correctly
    t = threading.Thread(target=loop)
    t.daemon = True
    t.start()

    # need to remove all args before starting server because gunicorn
    # will freak out if it sees unknown arguments
    sys.argv = sys.argv[:1]
    server.run()
Example #14
def main():
    # Required flags
    Flags.PARSER.add_argument(
        '--data_files',
        type=str,
        required=True,
        nargs='+',
        help='Path to JSON data files to be ingested',
    )

    # Optional flags that override default values
    Flags.PARSER.add_argument(
        '--datasource_name',
        type=str,
        default='',
        help='Optional datasource name. If unspecified, '
        'one will be generated.',
    )
    Flags.PARSER.add_argument(
        '--task_template_file',
        type=str,
        default='',
        help='Optional indexing template to use',
    )
    Flags.PARSER.add_argument('--metrics_spec_file',
                              type=str,
                              default='',
                              help='Optional metrics spec to use')
    Flags.PARSER.add_argument(
        '--tuning_config_file',
        type=str,
        default='',
        help='Optional task tuning config to use',
    )
    Flags.PARSER.add_argument(
        '--task_hash_dir',
        type=str,
        default=DEFAULT_TASK_HASH_DIR,
        help='Directory where indexing task hashes are '
        'stored',
    )
    Flags.PARSER.add_argument(
        '--output_task_id_file',
        type=str,
        default='',
        help='File to store the indexing task ID in',
    )
    Flags.PARSER.add_argument(
        '--force',
        action='store_true',
        default=False,
        help='Force the datasource to be created even if '
        'a datasource already exists with the same '
        'data',
    )
    Flags.PARSER.add_argument(
        '--dry_run',
        action='store_true',
        default=False,
        help='Issue a "noop" indexing task and skip '
        'building a new datasource',
    )
    Flags.PARSER.add_argument(
        '--min_data_date',
        type=str,
        default=DEFAULT_MIN_DATA_DATE_STR,
        help='Optional earliest data date string: YYYY-MM-DD',
    )
    Flags.PARSER.add_argument(
        '--max_data_date',
        type=str,
        default=DEFAULT_MAX_DATA_DATE_STR,
        help='Optional latest data date string: YYYY-MM-DD',
    )
    Flags.InitArgs()

    # Create deterministic version number so that we can differentiate the
    # current live datasources even if they have the same datasource name.
    # NOTE(stephen): For some weird reason, this string version value has to
    # resolve to a value less than the task "lock" version, which is the
    # formatted timestamp that the druid indexing task actually began. This is
    # dumb. https://github.com/druid-io/druid/pull/3559
    version = TODAY.strftime('%Y-%m-%d.%H%M%S')
    indexing_task = build_indexing_task(version)
    indexing_task.print_overview()
    print('')

    (cur_datasource, cur_version) = get_current_datasource_for_site()
    if (not Flags.ARGS.force and cur_datasource
            and cur_version and not task_contains_new_data(
                indexing_task, cur_datasource, cur_version)):
        print('##### Skipping indexing since existing datasource '
              'contains the same data specified in this task. #####')
        print('##### Current datasource: %s #####' % cur_datasource)
        print('##### Current version: %s #####' % cur_version)
        # TODO(stephen): Switch to the log library so that we can specify
        # a loglevel as a flag. Then I won't have to comment out potentially
        # useful debug statements.
        # print 'Current task hash:'
        # print indexing_task.get_task_hash()
        return 0

    dry_run = Flags.ARGS.dry_run
    task_id = run_task(indexing_task, dry_run)
    if not task_id:
        return 1

    if not dry_run:
        store_task_hash(indexing_task)

    output_task_id_file = Flags.ARGS.output_task_id_file
    if output_task_id_file:
        FileUtils.CreateFileWithData(output_task_id_file, task_id)

    print('Successfully started indexing task. Task ID: %s' % task_id)
    return 0
Example #15
def setup_flags():
    Flags.PARSER.add_argument(
        '--dimensions',
        type=str,
        nargs='*',
        required=False,
        help='List of dimensions columns, comma-separated',
    )
    Flags.PARSER.add_argument(
        '--rename_cols',
        type=str,
        nargs='*',
        required=False,
        help='Optional mappings for renaming CSV columns, formatted as '
        '"OriginalName:NewName". For example: region_name:RegionName',
    )
    Flags.PARSER.add_argument(
        '--join_cols',
        type=str,
        nargs='*',
        required=False,
        help='Optional mappings for joining CSV columns, formatted as '
        '"OriginalName1+OriginalName2:NewName". '
        'Customize the concatenation by specifying --join_str. '
        'For example: region_name+district_name:GeoName',
    )
    Flags.PARSER.add_argument(
        '--join_str',
        type=str,
        required=False,
        default=' - ',
        help='String that is used to concatenate join_cols',
    )

    Flags.PARSER.add_argument(
        '--fields',
        type=str,
        nargs='*',
        required=False,
        help='List of field columns to unpivot, comma-separated. If not '
        'specified, the data is assumed to be unpivoted with "field" and '
        '"val" columns.',
    )

    Flags.PARSER.add_argument('--date',
                              type=str,
                              required=True,
                              help='The date column')

    Flags.PARSER.add_argument('--prefix',
                              type=str,
                              required=True,
                              help='Field ID prefix')
    Flags.PARSER.add_argument('--sourcename',
                              type=str,
                              required=True,
                              help='Name of source')
    Flags.PARSER.add_argument(
        '--disable_rollup',
        action='store_true',
        default=False,
        help='If set, do not combine values for rows that share the same '
        'dimensions + date',
    )
    Flags.PARSER.add_argument(
        '--policy',
        type=str,
        required=False,
        default='ABORT',
        help='Policy for handling data anomalies',
    )
    Flags.PARSER.add_argument(
        '--tracer_field',
        type=str,
        required=False,
        default=None,
        help='Field id for facility count indicators.',
    )
    Flags.PARSER.add_argument(
        '--flatten_string_categories',
        action='store_true',
        default=False,
        help='If true, append string values to field '
        'names and set value to 1. In other words, '
        'convert "FieldName: yes" values to '
        '"FieldName - yes: 1"',
    )
    Flags.PARSER.add_argument(
        '--enable_field_wildcards',
        action='store_true',
        default=False,
        help='If true, unpivot all columns that begin with '
        '*field_ rather than specifying field names '
        'individually. Overrides --fields param',
    )
    Flags.PARSER.add_argument(
        '--input',
        type=str,
        required=True,
        help='Path to input CSV. File type can be: '
        'uncompressed (.csv), '
        'gzip compressed (.gz), '
        'or lz4 compressed (.lz4)',
    )
    Flags.PARSER.add_argument('--output_rows',
                              type=str,
                              required=True,
                              help='Path to output rows json lz4')
    Flags.PARSER.add_argument('--output_locations',
                              type=str,
                              required=True,
                              help='Path to output locations')
    Flags.PARSER.add_argument('--output_fields',
                              type=str,
                              required=True,
                              help='Path to output fields')
    Flags.PARSER.add_argument(
        '--output_indicators',
        type=str,
        required=False,
        help='Path to output JSON indicator groups',
    )

    Flags.InitArgs()
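
To illustrate the "OriginalName:NewName" format that --rename_cols documents, a hypothetical parsing helper (not part of the original source):

def build_rename_map(rename_cols):
    # Split each "OriginalName:NewName" pair on the first colon only, so
    # the new name may itself contain colons.
    return dict(pair.split(':', 1) for pair in (rename_cols or []))

For example, build_rename_map(['region_name:RegionName']) returns {'region_name': 'RegionName'}.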