Esempio n. 1
0
def store_eval(job_dir, results):
    """Upload the first evaluation-result dict to GCS as eval_data.json.

    Args:
      job_dir: Path of the training job, e.g.
        'gs://<project>-mlengine/spam_trainer_<timestamp>'. If it does not
        match that pattern (i.e. a local run), nothing is uploaded.
      results: Sequence whose first element is a dict of evaluation metrics.
    """
    tf.logging.info('job_dir: %s' % job_dir)
    # Raw string so '\d' is a regex digit class, not a (deprecated) escape.
    job_info = re.search(r'gs://(monorail-.+)-mlengine/(spam_trainer_\d+)',
                         job_dir)

    # Only upload eval data if this is not being run locally.
    if job_info:
        project = job_info.group(1)
        job_name = job_info.group(2)

        tf.logging.info('project: %s' % project)
        tf.logging.info('job_name: %s' % job_name)

        client_obj = client.Client(project=project)
        bucket_name = '%s-mlengine' % project
        bucket_obj = bucket.Bucket(client_obj, bucket_name)

        bucket_obj.blob = blob.Blob(job_name + '/eval_data.json', bucket_obj)
        # np.float32 is not JSON-serializable; convert to native Python float.
        for key, value in results[0].items():
            if isinstance(value, np.float32):
                results[0][key] = value.item()

        bucket_obj.blob.upload_from_string(json.dumps(results[0]),
                                           content_type='application/json')

    else:
        # Fixed typo in the logged message: 'evalution' -> 'evaluation'.
        tf.logging.error('Could not find bucket "%s" to output evaluation to.' %
                         job_dir)
Esempio n. 2
0
def store_component_conversion(job_dir, data):
    """Persist the component index mapping as JSON.

    Writes to GCS when job_dir looks like an ML Engine job path, otherwise
    writes to a local file under job_dir (creating directories as needed).

    Args:
      job_dir: 'gs://<project>-mlengine/component_trainer_<timestamp>' for a
        cloud run, or a local directory path.
      data: JSON-serializable component index mapping.
    """
    tf.logging.info('job_dir: %s' % job_dir)
    # Raw string so '\d' is a regex digit class, not a (deprecated) escape.
    job_info = re.search(r'gs://(monorail-.+)-mlengine/(component_trainer_\d+)',
                         job_dir)

    # Check if training is being done on GAE or locally.
    if job_info:
        project = job_info.group(1)
        job_name = job_info.group(2)

        client_obj = client.Client(project=project)
        bucket_name = '%s-mlengine' % project
        bucket_obj = bucket.Bucket(client_obj, bucket_name)

        bucket_obj.blob = blob.Blob(job_name + '/component_index.json',
                                    bucket_obj)

        bucket_obj.blob.upload_from_string(json.dumps(data),
                                           content_type='application/json')

    else:
        # os.makedirs already creates intermediate directories recursively,
        # so the previous segment-by-segment loop was redundant.
        if not os.path.exists(job_dir):
            os.makedirs(job_dir)
        with open(job_dir + '/component_index.json', 'w') as f:
            f.write(json.dumps(data))
Esempio n. 3
0
def CompareAccuracy(args):
    """Fetch and print AUC / AUC-PR metrics for args.model1 and args.model2."""
    gcs_client = client.Client(project=args.project)
    mlengine_bucket = bucket.Bucket(gcs_client, '%s-mlengine' % args.project)

    first_auc, first_auc_pr = get_auc(args.model1, mlengine_bucket)
    print('%s:\nAUC: %f\tAUC Precision/Recall: %f\n' %
          (args.model1, first_auc, first_auc_pr))

    second_auc, second_auc_pr = get_auc(args.model2, mlengine_bucket)
    print('%s:\nAUC: %f\tAUC Precision/Recall: %f' %
          (args.model2, second_auc, second_auc_pr))
Esempio n. 4
0
def make_top_words_list(job_dir):
    """Returns the top (most common) words in the entire dataset for component
    prediction. If a file is already stored in GCS containing these words, the
    words from the file are simply returned. Otherwise, the most common words
    are determined and written to GCS, before being returned.

    Args:
      job_dir: Path of the training job; a
        'gs://<project>-mlengine/component_trainer_<timestamp>' path triggers
        an upload of the word list to GCS, any other value falls back to the
        'monorail-prod' project and skips the upload.

    Returns:
      A list of the most common words in the dataset (the number of them
      determined by ml_helpers.COMPONENT_FEATURES).
    """
    credentials = GoogleCredentials.get_application_default()
    storage = discovery.build('storage', 'v1', credentials=credentials)
    objects = storage.objects()

    # Raw string so '\d' is a regex digit class, not a (deprecated) escape.
    subpaths = re.match(r'gs://(monorail-.*)-mlengine/(component_trainer_\d+)',
                        job_dir)

    if subpaths:
        project_id = subpaths.group(1)
        trainer_folder = subpaths.group(2)
    else:
        project_id = 'monorail-prod'

    storage_bucket = project_id + '.appspot.com'
    request = objects.list(bucket=storage_bucket,
                           prefix='component_training_data')

    response = trainer.dataset.make_api_request(request)

    # A listing with no matches has no 'items' key; guard against None so the
    # comprehension below doesn't raise TypeError.
    items = response.get('items') or []
    csv_filepaths = [b.get('name') for b in items]

    # Single linear join instead of quadratic '+=' concatenation; each word is
    # newline-terminated, matching the stored file format exactly.
    final_string = ''.join(
        word + '\n' for word in parse_words(csv_filepaths, objects,
                                            storage_bucket, project_id))

    if subpaths:
        client_obj = client.Client(project=project_id)
        bucket_obj = bucket.Bucket(client_obj, project_id + '-mlengine')

        bucket_obj.blob = google.cloud.storage.blob.Blob(
            trainer_folder + '/' + TOP_WORDS, bucket_obj)
        bucket_obj.blob.upload_from_string(final_string,
                                           content_type='text/plain')
    return final_string.split()
Esempio n. 5
0
def _main(module, name, state, location, project, storage_class, force,
          acl, reset_acl,
          default_acl, reset_default_acl,
          iam_policy, reset_iam_policy):
    """Reconcile a GCS bucket toward the desired state and report the result.

    Creates/updates ('present'), deletes ('absent'), or just inspects ('get')
    the bucket `name`, adjusts bucket ACLs, default object ACLs, and IAM
    policy, then reports everything via module.exit_json.

    NOTE(review): `module` appears to be an AnsibleModule-like object exposing
    `check_mode` and `exit_json` — confirm against the caller.
    """
    storage_client = storage.Client()
    changed = False
    # lookup_bucket returns None when the bucket does not exist.
    bucket_obj = storage_client.lookup_bucket(name)
    # Seed all three diffs with a "no change" sentinel so the result dict is
    # well-formed even on branches that never adjust permissions (e.g. 'get').
    iam_policy_diff = acl_diff = default_acl_diff = AclDiff(False, [], [])
    final_policy = {}
    if state == 'present':
        if not bucket_obj:
            changed = True
            bucket_obj = bucket.Bucket(storage_client, name)
            bucket_obj.location = location
            bucket_obj.storage_class = storage_class

            # In check mode we only report what would happen; skip creation.
            if not module.check_mode:
                bucket_obj.create(project=project)

        # adjust permissions
        acl_diff = _adjust_acl(module, bucket_obj.acl, acl, reset_acl)
        default_acl_diff = _adjust_acl(module, bucket_obj.default_object_acl, default_acl, reset_default_acl)
        final_policy, iam_policy_diff = _adjust_iam(module, bucket_obj, iam_policy, reset_iam_policy)

    elif state == 'absent':
        if bucket_obj:
            changed = True
            # Reload ACLs so the final report reflects server-side state
            # before the bucket goes away.
            bucket_obj.acl.reload()
            bucket_obj.default_object_acl.reload()
            if not module.check_mode:
                # `force` presumably deletes even a non-empty bucket — verify
                # against the storage client's Bucket.delete signature.
                bucket_obj.delete(force)
    elif state == 'get':
        # Read-only: just fall through and report the bucket as-is.
        pass
    else:
        module.exit_json(failed=True, error="Unexpected state '%s'" % state)

    # The overall change flag is the union of the create/delete flag and
    # every permission diff.
    changed = any([changed, acl_diff.changed, default_acl_diff.changed, iam_policy_diff.changed])
    result = {
        'changed': changed,
        'bucket': _bucket_repr(bucket_obj, final_policy),
        'changes': {
            'acl': acl_diff._asdict(),
            'default_acl': default_acl_diff._asdict(),
            'iam_policy': iam_policy_diff._asdict(),
        },
        'state': state,
    }
    module.exit_json(**result)
Esempio n. 6
0
def Predict(args):
    """Classify the text in args.content with the deployed component model.

    Looks up the default version of MODEL_NAME, derives the matching trainer
    folder from the version name, issues a prediction request, and maps the
    predicted indexes back through the stored component index.

    Args:
      args: Object with `content` (path of the text file to classify) and
        `project` (GCP project id) attributes.

    Returns:
      read_indexes(response, component_index_dict) on success; implicitly
      None when the API call fails (the error is printed instead).
    """
    # NOTE(review): `credentials` is expected to be defined at module level.
    ml = googleapiclient.discovery.build('ml', 'v1', credentials=credentials)

    with open(args.content) as f:
        content = f.read()

    project_ID = 'projects/%s' % args.project
    full_model_name = '%s/models/%s' % (project_ID, MODEL_NAME)
    model_request = ml.projects().models().get(name=full_model_name)
    model_response = model_request.execute()

    version_name = model_response['defaultVersion']['name']

    # The trainer folder shares the timestamp embedded in the version name
    # ('v_<timestamp>'). Raw string keeps '\d' a regex digit class.
    model_name = 'component_trainer_' + re.search(r"v_(\d+)",
                                                  version_name).group(1)

    client_obj = client.Client(project=args.project)
    bucket_name = '%s-mlengine' % args.project
    bucket_obj = bucket.Bucket(client_obj, bucket_name)

    instance = ml_helpers.GenerateFeaturesRaw([content], COMPONENT_FEATURES,
                                              getTopWords(
                                                  bucket_name, model_name))

    request = ml.projects().predict(
        name=full_model_name,
        body={'instances': [{
            'inputs': instance['word_features']
        }]})

    try:
        response = request.execute()

        bucket_obj.blob = blob.Blob('%s/component_index.json' % model_name,
                                    bucket_obj)
        component_index = bucket_obj.blob.download_as_string()
        component_index_dict = json.loads(component_index)

        return read_indexes(response, component_index_dict)

    # 'except X as err' works on Python 2.6+ and Python 3; the old comma form
    # ('except X, err') is a syntax error on Python 3.
    except googleapiclient.errors.HttpError as err:
        print('There was an error. Check the details:')
        print(err._get_reason())