Code example #1
    def test___iter___w_more(self):
        from google.cloud.datastore.query import _pb_from_query
        connection = _Connection()
        client = self._makeClient(connection)
        query = _Query(client, self._KIND, self._PROJECT, self._NAMESPACE)
        self._addQueryResults(connection, cursor=self._END, more=True)
        self._addQueryResults(connection)
        iterator = self._makeOne(query, client)
        entities = list(iterator)

        self.assertFalse(iterator._more_results)
        self.assertEqual(len(entities), 2)
        for entity in entities:
            self.assertEqual(
                entity.key.path,
                [{'kind': self._KIND, 'id': self._ID}])
            self.assertEqual(entity['foo'], u'Foo')
        qpb1 = _pb_from_query(query)
        qpb2 = _pb_from_query(query)
        qpb2.start_cursor = self._END
        EXPECTED1 = {
            'project': self._PROJECT,
            'query_pb': qpb1,
            'namespace': self._NAMESPACE,
            'transaction_id': None,
        }
        EXPECTED2 = {
            'project': self._PROJECT,
            'query_pb': qpb2,
            'namespace': self._NAMESPACE,
            'transaction_id': None,
        }
        self.assertEqual(len(connection._called_with), 2)
        self.assertEqual(connection._called_with[0], EXPECTED1)
        self.assertEqual(connection._called_with[1], EXPECTED2)
Code example #2
    def test___iter___w_limit(self):
        from google.cloud.datastore.query import _pb_from_query

        connection = _Connection()
        client = self._makeClient(connection)
        query = _Query(client, self._KIND, self._PROJECT, self._NAMESPACE)
        skip1 = 4
        skip2 = 9
        self._addQueryResults(connection,
                              more=True,
                              skipped_results=skip1,
                              no_entity=True)
        self._addQueryResults(connection, more=True, skipped_results=skip2)
        self._addQueryResults(connection)
        offset = skip1 + skip2
        iterator = self._makeOne(query, client, limit=2, offset=offset)
        entities = list(iterator)

        self.assertFalse(iterator._more_results)
        self.assertEqual(len(entities), 2)
        for entity in entities:
            self.assertEqual(entity.key.path, [{
                'kind': self._KIND,
                'id': self._ID
            }])
        qpb1 = _pb_from_query(query)
        qpb1.limit.value = 2
        qpb1.offset = offset
        qpb2 = _pb_from_query(query)
        qpb2.start_cursor = self._END
        qpb2.limit.value = 2
        qpb2.offset = offset - skip1
        qpb3 = _pb_from_query(query)
        qpb3.start_cursor = self._END
        qpb3.limit.value = 1
        EXPECTED1 = {
            'project': self._PROJECT,
            'query_pb': qpb1,
            'namespace': self._NAMESPACE,
            'transaction_id': None,
        }
        EXPECTED2 = {
            'project': self._PROJECT,
            'query_pb': qpb2,
            'namespace': self._NAMESPACE,
            'transaction_id': None,
        }
        EXPECTED3 = {
            'project': self._PROJECT,
            'query_pb': qpb3,
            'namespace': self._NAMESPACE,
            'transaction_id': None,
        }
        self.assertEqual(len(connection._called_with), 3)
        self.assertEqual(connection._called_with[0], EXPECTED1)
        self.assertEqual(connection._called_with[1], EXPECTED2)
        self.assertEqual(connection._called_with[2], EXPECTED3)
Code example #3
File: test_query.py  Project: kwoodson/gcloud-python
    def test___iter___w_limit(self):
        from google.cloud.datastore.query import _pb_from_query

        connection = _Connection()
        client = self._makeClient(connection)
        query = _Query(client, self._KIND, self._PROJECT, self._NAMESPACE)
        skip1 = 4
        skip2 = 9
        self._addQueryResults(connection, more=True, skipped_results=skip1,
                              no_entity=True)
        self._addQueryResults(connection, more=True, skipped_results=skip2)
        self._addQueryResults(connection)
        offset = skip1 + skip2
        iterator = self._makeOne(query, client, limit=2, offset=offset)
        entities = list(iterator)

        self.assertFalse(iterator._more_results)
        self.assertEqual(len(entities), 2)
        for entity in entities:
            self.assertEqual(
                entity.key.path,
                [{'kind': self._KIND, 'id': self._ID}])
        qpb1 = _pb_from_query(query)
        qpb1.limit.value = 2
        qpb1.offset = offset
        qpb2 = _pb_from_query(query)
        qpb2.start_cursor = self._END
        qpb2.limit.value = 2
        qpb2.offset = offset - skip1
        qpb3 = _pb_from_query(query)
        qpb3.start_cursor = self._END
        qpb3.limit.value = 1
        EXPECTED1 = {
            'project': self._PROJECT,
            'query_pb': qpb1,
            'namespace': self._NAMESPACE,
            'transaction_id': None,
        }
        EXPECTED2 = {
            'project': self._PROJECT,
            'query_pb': qpb2,
            'namespace': self._NAMESPACE,
            'transaction_id': None,
        }
        EXPECTED3 = {
            'project': self._PROJECT,
            'query_pb': qpb3,
            'namespace': self._NAMESPACE,
            'transaction_id': None,
        }
        self.assertEqual(len(connection._called_with), 3)
        self.assertEqual(connection._called_with[0], EXPECTED1)
        self.assertEqual(connection._called_with[1], EXPECTED2)
        self.assertEqual(connection._called_with[2], EXPECTED3)
Code example #4
def delete_from_datastore(project, pipeline_options, run_locally):
    """Creates a pipeline that reads entities from Cloud Datastore."""
    p = beam.Pipeline(options=pipeline_options)
    # Create a query to read entities from datastore.
    client = datastore.Client()

    if run_locally:
        pass
        #q.add_filter('category', '=', 'BEBOP')

    q = client.query(kind='PRDebugAttendee')
    q.order = ['-created_date']  # order newest-first so fetch(1) returns the most recent record
    results = list(q.fetch(1))
    if not results:
        logging.error('No PRDebugAttendee objects found')
        return

    newest_date = results[0]['created_date']
    logging.info('Deleting elements older than %s', newest_date)

    q1 = client.query(kind='PRDebugAttendee')
    q2 = client.query(kind='PRCityCategory')
    datastore_1 = p | 'read PRDebugAttendee from datastore' >> ReadFromDatastore(
        project, query._pb_from_query(q1), num_splits=400)
    datastore_2 = p | 'read PRCityCategory from datastore' >> ReadFromDatastore(
        project, query._pb_from_query(q2), num_splits=400)
    # Set up our map/reduce pipeline
    output = (
        (datastore_1, datastore_2) | beam.Flatten()
        | 'convert to entity' >> beam.Map(ConvertToEntity)
        # Find the events we want to count, and expand all the admins/attendees
        | 'find old rankings' >> beam.FlatMap(OldPRRecord, newest_date)
        # And save it all back to the database
    )
    if not run_locally:
        output | 'delete from datastore' >> beam.ParDo(DeleteFromDatastore())
        """
        (output
            | 'convert from entity' >> beam.Map(ConvertFromEntity)
            | 'write to datastore' >> WriteToDatastore(client.project)
        )
        """

    # Actually run the pipeline (all operations above are deferred).
    result = p.run()
    # Wait until completion, main thread would access post-completion job results.
    result.wait_until_finish()
    return result
Code example #5
    def test_next_page_w_cursors_w_more(self):
        from base64 import urlsafe_b64decode
        from base64 import urlsafe_b64encode
        from google.cloud.datastore.query import _pb_from_query
        connection = _Connection()
        client = self._makeClient(connection)
        query = _Query(client, self._KIND, self._PROJECT, self._NAMESPACE)
        self._addQueryResults(connection, cursor=self._END, more=True)
        iterator = self._makeOne(query, client)
        iterator._start_cursor = self._START
        iterator._end_cursor = self._END
        entities, more_results, cursor = iterator.next_page()

        self.assertEqual(cursor, urlsafe_b64encode(self._END))
        self.assertTrue(more_results)
        self.assertTrue(iterator._more_results)
        self.assertEqual(iterator._skipped_results, None)
        self.assertEqual(iterator._end_cursor, None)
        self.assertEqual(urlsafe_b64decode(iterator._start_cursor), self._END)
        self.assertEqual(len(entities), 1)
        self.assertEqual(entities[0].key.path,
                         [{'kind': self._KIND, 'id': self._ID}])
        self.assertEqual(entities[0]['foo'], u'Foo')
        qpb = _pb_from_query(query)
        qpb.offset = 0
        qpb.start_cursor = urlsafe_b64decode(self._START)
        qpb.end_cursor = urlsafe_b64decode(self._END)
        EXPECTED = {
            'project': self._PROJECT,
            'query_pb': qpb,
            'namespace': self._NAMESPACE,
            'transaction_id': None,
        }
        self.assertEqual(connection._called_with, [EXPECTED])
Code example #6
    def test_next_page_no_cursors_no_more_w_offset_and_limit(self):
        from google.cloud.datastore.query import _pb_from_query
        connection = _Connection()
        client = self._makeClient(connection)
        query = _Query(client, self._KIND, self._PROJECT, self._NAMESPACE)
        skipped_results = object()
        self._addQueryResults(connection, cursor=b'',
                              skipped_results=skipped_results)
        iterator = self._makeOne(query, client, 13, 29)
        entities, more_results, cursor = iterator.next_page()

        self.assertEqual(cursor, None)
        self.assertFalse(more_results)
        self.assertFalse(iterator._more_results)
        self.assertEqual(iterator._skipped_results, skipped_results)
        self.assertEqual(len(entities), 1)
        self.assertEqual(entities[0].key.path,
                         [{'kind': self._KIND, 'id': self._ID}])
        self.assertEqual(entities[0]['foo'], u'Foo')
        qpb = _pb_from_query(query)
        qpb.limit.value = 13
        qpb.offset = 29
        EXPECTED = {
            'project': self._PROJECT,
            'query_pb': qpb,
            'namespace': self._NAMESPACE,
            'transaction_id': None,
        }
        self.assertEqual(connection._called_with, [EXPECTED])
Code example #7
    def test___iter___no_more(self):
        from google.cloud.datastore.query import _pb_from_query
        connection = _Connection()
        client = self._makeClient(connection)
        query = _Query(client, self._KIND, self._PROJECT, self._NAMESPACE)
        self._addQueryResults(connection)
        iterator = self._makeOne(query, client)
        entities = list(iterator)

        self.assertFalse(iterator._more_results)
        self.assertEqual(len(entities), 1)
        self.assertEqual(entities[0].key.path, [{
            'kind': self._KIND,
            'id': self._ID
        }])
        self.assertEqual(entities[0]['foo'], u'Foo')
        qpb = _pb_from_query(query)
        qpb.offset = 0
        EXPECTED = {
            'project': self._PROJECT,
            'query_pb': qpb,
            'namespace': self._NAMESPACE,
            'transaction_id': None,
        }
        self.assertEqual(connection._called_with, [EXPECTED])
Code example #8
File: test_query.py  Project: kwoodson/gcloud-python
    def test___iter___no_more(self):
        from google.cloud.datastore.query import _pb_from_query
        connection = _Connection()
        client = self._makeClient(connection)
        query = _Query(client, self._KIND, self._PROJECT, self._NAMESPACE)
        self._addQueryResults(connection)
        iterator = self._makeOne(query, client)
        entities = list(iterator)

        self.assertFalse(iterator._more_results)
        self.assertEqual(len(entities), 1)
        self.assertEqual(entities[0].key.path,
                         [{'kind': self._KIND, 'id': self._ID}])
        self.assertEqual(entities[0]['foo'], u'Foo')
        qpb = _pb_from_query(query)
        qpb.offset = 0
        EXPECTED = {
            'project': self._PROJECT,
            'query_pb': qpb,
            'namespace': self._NAMESPACE,
            'transaction_id': None,
        }
        self.assertEqual(connection._called_with, [EXPECTED])
Code example #9
    def _callFUT(self, query):
        from google.cloud.datastore.query import _pb_from_query
        return _pb_from_query(query)
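
The helper exercised by these tests, _pb_from_query, turns a high-level Query object into the Datastore query protobuf that the lower-level connection and the Beam connector expect. A minimal sketch of calling it directly (assuming an older google-cloud-datastore release that still exposes this private helper, and a client configured from the environment):

from google.cloud import datastore
from google.cloud.datastore.query import _pb_from_query

# Assumed setup: project and credentials come from the environment;
# the kind name is borrowed from the pipeline examples on this page.
client = datastore.Client()
q = client.query(kind='PRDebugAttendee')
q.order = ['-created_date']

query_pb = _pb_from_query(q)   # high-level Query -> query protobuf
print(query_pb.kind, query_pb.order)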
Code example #10
def delete_from_datastore(project, pipeline_options, run_locally):
    """Creates a pipeline that reads entities from Cloud Datastore."""
    p = beam.Pipeline(options=pipeline_options)
    # Create a query to read entities from datastore.
    client = datastore.Client()

    if run_locally:
        pass
        #q.add_filter('category', '=', 'BEBOP')

    q = client.query(kind='PRDebugAttendee')
    q.order = ['-created_date']  # order newest-first so fetch(1) returns the most recent record
    results = list(q.fetch(1))
    if not results:
        logging.error('No PRDebugAttendee objects found')
        return

    newest_date = results[0]['created_date']
    logging.info('Deleting elements older than %s', newest_date)

    q1 = client.query(kind='PRDebugAttendee')
    q2 = client.query(kind='PRCityCategory')
    datastore_1 = p | 'read PRDebugAttendee from datastore' >> ReadFromDatastore(project, query._pb_from_query(q1), num_splits=400)
    datastore_2 = p | 'read PRCityCategory from datastore' >> ReadFromDatastore(project, query._pb_from_query(q2), num_splits=400)
    # Set up our map/reduce pipeline
    output = ((datastore_1, datastore_2) | beam.Flatten() | 'convert to entity' >> beam.Map(ConvertToEntity)
              # Find the events we want to count, and expand all the admins/attendees
              | 'find old rankings' >> beam.FlatMap(OldPRRecord, newest_date)
              # And save it all back to the database
             )
    if not run_locally:
        output | 'delete from datastore' >> beam.ParDo(DeleteFromDatastore())
        """
        (output
            | 'convert from entity' >> beam.Map(ConvertFromEntity)
            | 'write to datastore' >> WriteToDatastore(client.project)
        )
        """

    # Actually run the pipeline (all operations above are deferred).
    result = p.run()
    # Wait until completion, main thread would access post-completion job results.
    result.wait_until_finish()
    return result
Code example #11
def run_pipeline(project, pipeline_options, args):
    """Creates a pipeline that reads entities from Cloud Datastore."""

    run_locally = args.run_locally
    run_on_fraction = args.run_on_fraction
    ground_truth_events = args.ground_truth_events
    debug_attendees = args.debug_attendees
    want_top_attendees = args.want_top_attendees
    person_locations = args.person_locations

    p = beam.Pipeline(options=pipeline_options)
    # Create a query to read entities from datastore.
    client = datastore.Client()
    q = client.query(kind='DBEvent')

    if run_locally:
        q.key_filter(client.key('DBEvent', '999'), '>')
        q.key_filter(client.key('DBEvent', 'A'), '<')

    # Let's build a timestamp to save all our objects with
    timestamp = datetime.datetime.now()

    # Set up our map/reduce pipeline
    produce_attendees = (
        p |
        'read from datastore' >> ReadFromDatastore(project, query._pb_from_query(q), num_splits=400) |
        'convert to entity' >> beam.Map(ConvertToEntity) |
        # Find the events we want to count, and expand all the admins/attendees
        'filter events' >> beam.FlatMap(CountableEvent, ground_truth_events, run_on_fraction) |
        'load fb attending' >> beam.ParDo(GetEventAndAttending()) |
        'export attendees' >> beam.FlatMap(ExportPeople)
    ) # yapf: disable


    if want_top_attendees or debug_attendees:
        top_attendee_lists = (
            produce_attendees |
            'map category -> person' >> beam.FlatMap(GroupPeopleByCategory) |
            'group by category' >> beam.GroupByKey() |
            'build top-people lists' >> beam.FlatMap(CountPeopleInfos)
        ) # yapf: disable
        if want_top_attendees:
            (
                top_attendee_lists |
                'convert dict to json' >> beam.ParDo(ConvertDictToText) |
                'write json' >> WriteToText('gs://dancedeets-hrd.appspot.com/people-ranking-outputs/city-category/%s/data' % timestamp, file_name_suffix='.txt')
                #'generate PRCityCategory database record' >> beam.ParDo(BuildPRCityCategory(), timestamp, 'PRCityCategory', TOP_CITY_N) |
                #'write PRCityCategory to datastore (unbatched)' >> beam.ParDo(WriteToDatastoreSingle(), actually_save=not run_locally)
            ) # yapf: disable

        if debug_attendees:
            attendee_event_debugging = (
                produce_attendees |
                'map city-attendee -> event' >> beam.FlatMap(DebugExportEventPeopleForGrouping) |
                'group by city-attendee' >> beam.GroupByKey() |
                'within city-attendee, group event_ids by admin_hash' >> beam.FlatMap(DebugGroupEventIds)
            ) # yapf: disable

            exploded_top_attendees = (
                top_attendee_lists |
                'explode the top attendees into a mapping: category-attendee -> YES' >> beam.FlatMap(DebugExplodeAttendeeList)
                # We don't deal with duplicates, since it requires the objects (ie our dicts) to be hashable
                # Instead, we rely on DebugFilterForTopAttendee to filter out duplicates created by the above
                # | 'remove duplicates from multiple overlapping attendee-lists' >> beam.RemoveDuplicates()
            ) # yapf: disable

            (
                # These both have the same keys:
                # keys are {city, person_id}
                (attendee_event_debugging, exploded_top_attendees) | beam.Flatten() |
                'group the attendee-debug info with the is-it-a-top-attendee info' >> beam.GroupByKey() |
                'filter for TOP_ATTENDEE' >> beam.FlatMap(DebugFilterForTopAttendee) |
                'build PRDebugAttendee' >> beam.ParDo(DebugBuildPRDebugAttendee(), timestamp) |
                'write PRDebugAttendee to datastore (unbatched)' >> beam.ParDo(WriteToDatastoreSingle(), actually_save=not run_locally and not run_on_fraction)
            ) # yapf: disable

    if person_locations:
        build_person_cities = (
            produce_attendees |
            'map attendee -> city' >> beam.FlatMap(GroupAttendenceByPerson) |
            'group by attendee' >> beam.GroupByKey() |
            'build top-cities per-person' >> beam.FlatMap(CountPersonTopCities) |
            'convert dict to json' >> beam.ParDo(ConvertDictToText) |
            'write json' >> WriteToText('gs://dancedeets-hrd.appspot.com/people-ranking-outputs/people-city/%s/data' % timestamp, file_name_suffix='.txt')
            #'build PRPersonCity' >> beam.ParDo(BuildPRPersonCity(), timestamp) |
            #'write PRPersonCity to datastore (unbatched)' >> beam.ParDo(WriteToDatastoreSingle(), actually_save=not run_locally)
        ) # yapf: disable

    """
    (output
        | 'convert from entity' >> beam.Map(ConvertFromEntity)
        | 'write to datastore' >> WriteToDatastore(client.project)
    )
    """

    # Actually run the pipeline (all operations above are deferred).
    result = p.run()
    # Wait until completion, main thread would access post-completion job results.
    result.wait_until_finish()
    return result
Code example #12
def run_pipeline(project, pipeline_options, run_locally, debug_attendees):
    """Creates a pipeline that reads entities from Cloud Datastore."""
    p = beam.Pipeline(options=pipeline_options)
    # Create a query to read entities from datastore.
    client = datastore.Client()
    q = client.query(kind='DBEvent')

    if run_locally:
        q.key_filter(client.key('DBEvent', '999'), '>')
        q.key_filter(client.key('DBEvent', 'A'), '<')

    # Let's build a timestamp to save all our objects with
    timestamp = datetime.datetime.now()

    # Set up our map/reduce pipeline
    produce_attendees = (
        p | 'read from datastore' >> ReadFromDatastore(project, query._pb_from_query(q), num_splits=400) |
        'convert to entity' >> beam.Map(ConvertToEntity)
        # Find the events we want to count, and expand all the admins/attendees
        | 'filter events' >> beam.FlatMap(CountableEvent) | 'load fb attending' >> beam.ParDo(GetEventAndAttending()) |
        'export attendees' >> beam.FlatMap(ExportPeople)
    )

    top_attendee_lists = (
        produce_attendees | 'map category -> person' >> beam.FlatMap(GroupPeopleByCategory) | 'group by category' >> beam.GroupByKey() |
        'build top-people lists' >> beam.FlatMap(CountPeopleInfos)
    )

    if debug_attendees:
        attendee_event_debugging = (
            produce_attendees | 'map city-attendee -> event' >> beam.FlatMap(DebugExportEventPeopleForGrouping) |
            'group by city-attendee' >> beam.GroupByKey() |
            'within city-attendee, group event_ids by admin_hash' >> beam.FlatMap(DebugGroupEventIds)
        )

        exploded_top_attendees = (
            top_attendee_lists |
            'explode the top attendees into a mapping: category-attendee -> YES' >> beam.FlatMap(DebugExplodeAttendeeList)
            # We don't deal with duplicates, since it requires the objects (ie our dicts) to be hashable
            # Instead, we rely on DebugFilterForTopAttendee to filter out duplicates created by the above
            # | 'remove duplicates from multiple overlapping attendee-lists' >> beam.RemoveDuplicates()
        )

        (
            # These both have the same keys:
            # key contains {person_type, city, category, person_id}
            (attendee_event_debugging, exploded_top_attendees) | beam.Flatten()
            # keys are {city, person_id}
            | 'group the attendee-debug info with the is-it-a-top-attendee info' >> beam.GroupByKey() |
            'filter for TOP_ATTENDEE' >> beam.FlatMap(DebugFilterForTopAttendee) |
            'build PRDebugAttendee' >> beam.ParDo(DebugBuildPRDebugAttendee(), timestamp) |
            'write PRDebugAttendee to datastore (unbatched)' >> beam.ParDo(WriteToDatastoreSingle(), actually_save=not run_locally)
        )

    (
        top_attendee_lists |
        'generate PRCityCategory database record' >> beam.ParDo(BuildPRCityCategory(), timestamp, 'PRCityCategory', TOP_ALL_N) |
        'write PRCityCategory to datastore (unbatched)' >> beam.ParDo(WriteToDatastoreSingle(), actually_save=not run_locally)
    )
    """
    (output
        | 'convert from entity' >> beam.Map(ConvertFromEntity)
        | 'write to datastore' >> WriteToDatastore(client.project)
    )
    """

    # Actually run the pipeline (all operations above are deferred).
    result = p.run()
    # Wait until completion, main thread would access post-completion job results.
    result.wait_until_finish()
    return result
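
The pipeline examples above all share the same read step: build a Query with the Datastore client, convert it with _pb_from_query, and hand the resulting protobuf to Beam's ReadFromDatastore transform. A minimal, self-contained sketch of just that step (the import paths are assumptions for the older Beam v1 Datastore connector these snippets appear to target, and read_kind is a hypothetical helper name):

import apache_beam as beam
from apache_beam.io.gcp.datastore.v1.datastoreio import ReadFromDatastore
from google.cloud import datastore
from google.cloud.datastore.query import _pb_from_query


def read_kind(pipeline, project, kind):
    """Return a PCollection of Datastore entities of the given kind."""
    client = datastore.Client(project=project)
    q = client.query(kind=kind)
    query_pb = _pb_from_query(q)          # high-level Query -> query protobuf
    return (pipeline
            | 'read %s from datastore' % kind >> ReadFromDatastore(
                project, query_pb, num_splits=400))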