Example #1
    def expand(self, input_or_inputs):
        if self._data_dict:
            # Read entities from the data_dict
            entities_raw = (
                input_or_inputs
                | f"Read {self._table_name} from data_dict" >>
                _CreatePCollectionFromDict(data_dict=self._data_dict,
                                           field=self._table_name))

            # Read association table from the data_dict
            association_tuples_raw = (
                input_or_inputs
                | f"Read in {self._association_table} from data_dict" >>
                _CreatePCollectionFromDict(data_dict=self._data_dict,
                                           field=self._association_table))

        elif self._dataset:
            entity_query = f"SELECT * FROM `{self._dataset}." \
                f"{self._table_name}`"

            # Read entities from BQ
            entities_raw = (input_or_inputs
                            | f"Read {self._table_name} from BigQuery" >>
                            beam.io.Read(beam.io.BigQuerySource
                                         (query=entity_query,
                                          use_standard_sql=True)))

            association_table_query = f"SELECT {self._root_id_field}, " \
                f"{self._associated_id_field} FROM `{self._dataset}." \
                f"{self._association_table}`"

            # Read association table from BQ
            association_tuples_raw = (
                input_or_inputs
                | f"Read {self._association_table} from BigQuery" >>
                beam.io.Read(beam.io.BigQuerySource
                             (query=association_table_query,
                              use_standard_sql=True)))
        else:
            raise ValueError("No valid data source passed to the pipeline.")

        hydrate_kwargs = {'entity_class': self._entity_class,
                          'outer_connection_id_field':
                              self._entity_class.get_class_id_name(),
                          'inner_connection_id_field':
                              self._unifying_id_field}

        hydrated_entities = (entities_raw
                             | f"Hydrate {self._entity_class} instances" >>
                             beam.ParDo(_HydrateEntity(),
                                        **hydrate_kwargs))

        id_tuples_kwargs = {'root_id_field': self._root_id_field,
                            'associated_id_field': self._associated_id_field}

        association_tuples = (
            association_tuples_raw
            | f"Get root_ids and associated_ids from"
            f" {self._association_table} in tuples" >>
            beam.ParDo(_FormAssociationIDTuples(), **id_tuples_kwargs)
        )

        entities_tuples = (
            {'unifying_id_related_entity': hydrated_entities,
             'root_entity_ids': association_tuples}
            | f"Group hydrated {self._entity_class} instances to associated"
            f" ids"
            >> beam.CoGroupByKey()
        )

        return (entities_tuples
                | f"Repackage {self._entity_class} and id tuples" >>
                beam.ParDo(_RepackageUnifyingIdRootIdStructure()))
Example #2
        ]
        with open(output_filename, 'a') as f:
            f.write(result[0] + "\n")
        return result


if __name__ == '__main__':
    with beam.Pipeline(options=options) as p:
        rows = (p | ReadFromText(input_filename) | beam.ParDo(Split()))

        timings = (rows | beam.ParDo(CollectTimings())
                   | "Grouping timings" >> beam.GroupByKey()
                   | "Calculating average" >> beam.CombineValues(
                       beam.combiners.MeanCombineFn()))

        users = (rows | beam.ParDo(CollectUsers())
                 | "Grouping users" >> beam.GroupByKey() | "Counting users" >>
                 beam.CombineValues(beam.combiners.CountCombineFn()))

        to_be_joined = ({
            'timings': timings,
            'users': users
        } | beam.CoGroupByKey() | beam.ParDo(WriteToCSV())
                        # |WriteToText(output_filename)
                        )

import pandavro as pdx
import pandas as pd

df = pd.read_csv(output_filename)
pdx.to_avro('./data/output.avro', df)
Example #3
def run(argv=None):
    """Runs the incarceration calculation pipeline."""

    # Workaround to load SQLAlchemy objects at start of pipeline. This is necessary because the BuildRootEntity
    # function tries to access attributes of relationship properties on the SQLAlchemy room_schema_class before they
    # have been loaded. However, if *any* SQLAlchemy objects have been instantiated, then the relationship properties
    # are loaded and their attributes can be successfully accessed.
    _ = schema.StatePerson()

    # Parse command-line arguments
    known_args, pipeline_args = parse_arguments(argv)

    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True

    # Get pipeline job details
    all_pipeline_options = pipeline_options.get_all_options()

    query_dataset = all_pipeline_options['project'] + '.' + known_args.input
    reference_dataset = all_pipeline_options[
        'project'] + '.' + known_args.reference_input

    with beam.Pipeline(argv=pipeline_args) as p:
        # Get StatePersons
        persons = (p | 'Load StatePersons' >> BuildRootEntity(
            dataset=query_dataset,
            data_dict=None,
            root_schema_class=schema.StatePerson,
            root_entity_class=entities.StatePerson,
            unifying_id_field='person_id',
            build_related_entities=True))

        # Get StateSentenceGroups
        sentence_groups = (p | 'Load StateSentenceGroups' >> BuildRootEntity(
            dataset=query_dataset,
            data_dict=None,
            root_schema_class=schema.StateSentenceGroup,
            root_entity_class=entities.StateSentenceGroup,
            unifying_id_field='person_id',
            build_related_entities=True))

        # Get StateIncarcerationSentences
        incarceration_sentences = (
            p | 'Load StateIncarcerationSentences' >> BuildRootEntity(
                dataset=query_dataset,
                data_dict=None,
                root_schema_class=schema.StateIncarcerationSentence,
                root_entity_class=entities.StateIncarcerationSentence,
                unifying_id_field='person_id',
                build_related_entities=True))

        # Get StateSupervisionSentences
        supervision_sentences = (
            p | 'Load StateSupervisionSentences' >> BuildRootEntity(
                dataset=query_dataset,
                data_dict=None,
                root_schema_class=schema.StateSupervisionSentence,
                root_entity_class=entities.StateSupervisionSentence,
                unifying_id_field='person_id',
                build_related_entities=True))

        sentences_and_sentence_groups = (
            {
                'sentence_groups': sentence_groups,
                'incarceration_sentences': incarceration_sentences,
                'supervision_sentences': supervision_sentences
            }
            | 'Group sentences to sentence groups' >> beam.CoGroupByKey())

        # Set hydrated sentences on the corresponding sentence groups
        sentence_groups_with_hydrated_sentences = (
            sentences_and_sentence_groups
            | 'Set hydrated sentences on sentence groups' >> beam.ParDo(
                SetSentencesOnSentenceGroup()))

        # Group each StatePerson with their related entities
        person_and_sentence_groups = (
            {
                'person': persons,
                'sentence_groups': sentence_groups_with_hydrated_sentences
            }
            | 'Group StatePerson to SentenceGroups' >> beam.CoGroupByKey())

        # Bring in the table that associates people and their county of residence
        person_id_to_county_query = \
            f"SELECT * FROM `{reference_dataset}.persons_to_recent_county_of_residence`"

        person_id_to_county_kv = (
            p | "Read person_id to county associations from BigQuery" >>
            beam.io.Read(
                beam.io.BigQuerySource(query=person_id_to_county_query,
                                       use_standard_sql=True))
            | "Convert person_id to county association table to KV" >>
            beam.ParDo(ConvertDictToKVTuple(), 'person_id'))

        # Identify IncarcerationEvents events from the StatePerson's StateIncarcerationPeriods
        person_events = (
            person_and_sentence_groups
            | 'Classify Incarceration Events' >> beam.ParDo(
                ClassifyIncarcerationEvents(), AsDict(person_id_to_county_kv)))

        # Get dimensions to include and methodologies to use
        inclusions, _ = dimensions_and_methodologies(known_args)

        # Get pipeline job details for accessing job_id
        all_pipeline_options = pipeline_options.get_all_options()

        # The number of months to limit the monthly calculation output to
        calculation_month_limit = known_args.calculation_month_limit

        # Add timestamp for local jobs
        job_timestamp = datetime.datetime.now().strftime(
            '%Y-%m-%d_%H_%M_%S.%f')
        all_pipeline_options['job_timestamp'] = job_timestamp

        # Get IncarcerationMetrics
        incarceration_metrics = (
            person_events
            | 'Get Incarceration Metrics' >> GetIncarcerationMetrics(
                pipeline_options=all_pipeline_options,
                inclusions=inclusions,
                calculation_month_limit=calculation_month_limit))

        # Convert the metrics into a format that's writable to BQ
        writable_metrics = (
            incarceration_metrics | 'Convert to dict to be written to BQ' >>
            beam.ParDo(IncarcerationMetricWritableDict()).with_outputs(
                'admissions', 'populations', 'releases'))

        # Write the metrics to the output tables in BigQuery
        admissions_table = known_args.output + '.incarceration_admission_metrics'

        population_table = known_args.output + '.incarceration_population_metrics'

        releases_table = known_args.output + '.incarceration_release_metrics'

        _ = (writable_metrics.admissions
             | f"Write admission metrics to BQ table: {admissions_table}" >>
             beam.io.WriteToBigQuery(
                 table=admissions_table,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))

        _ = (writable_metrics.populations
             | f"Write population metrics to BQ table: {population_table}" >>
             beam.io.WriteToBigQuery(
                 table=population_table,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))

        _ = (writable_metrics.releases
             | f"Write release metrics to BQ table: {releases_table}" >>
             beam.io.WriteToBigQuery(
                 table=releases_table,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))
Example #4
    def run_test_pipeline(
        self,
        person_id: int,
        sentence: SentenceType,
        us_mo_sentence_status_rows: List[Dict[str, str]],
        expected_sentence: SentenceType,
    ):
        """Runs a test pipeline to test ConvertSentencesToStateSpecificType and checks the output against expected."""
        test_pipeline = TestPipeline()

        us_mo_sentence_statuses = (test_pipeline
                                   | "Create MO sentence statuses" >>
                                   beam.Create(us_mo_sentence_status_rows))

        sentence_status_rankings_as_kv = (
            us_mo_sentence_statuses
            | "Convert MO sentence status ranking table to KV tuples" >>
            beam.ParDo(ConvertDictToKVTuple(), "person_id"))

        sentences = test_pipeline | "Create person_id sentence tuple" >> beam.Create(
            [(person_id, sentence)])

        empty_sentences = test_pipeline | "Create empty PCollection" >> beam.Create(
            [])

        if isinstance(sentence, StateSupervisionSentence):
            supervision_sentences = sentences
            incarceration_sentences = empty_sentences
        else:
            incarceration_sentences = sentences
            supervision_sentences = empty_sentences

        sentences_and_statuses = (
            {
                "incarceration_sentences": incarceration_sentences,
                "supervision_sentences": supervision_sentences,
                "sentence_statuses": sentence_status_rankings_as_kv,
            }
            | "Group sentences to the sentence statuses for that person" >>
            beam.CoGroupByKey())

        output = (
            sentences_and_statuses
            | "Convert to state-specific sentences" >> beam.ParDo(
                entity_hydration_utils.ConvertSentencesToStateSpecificType()).
            with_outputs("incarceration_sentences", "supervision_sentences"))

        # Expect no change
        expected_output = [(person_id, expected_sentence)]

        if isinstance(sentence, StateSupervisionSentence):
            assert_that(
                output.supervision_sentences,
                self.convert_sentence_output_is_valid(expected_output),
            )
        else:
            assert_that(
                output.incarceration_sentences,
                self.convert_sentence_output_is_valid(expected_output),
            )

        test_pipeline.run()
Example #5
    def run_test_pipeline(
            fake_person_id: int,
            state_code: str,
            dataset: str,
            expected_metric_types: Set[IncarcerationMetricType],
            allow_empty: bool = False,
            unifying_id_field_filter_set: Optional[Set[int]] = None,
            metric_types_filter: Optional[Set[str]] = None):
        """Runs a test version of the incarceration pipeline."""
        test_pipeline = TestPipeline()

        # Get StatePersons
        persons = (
            test_pipeline
            | 'Load Persons' >>  # type: ignore
            extractor_utils.BuildRootEntity(
                dataset=dataset,
                root_entity_class=entities.StatePerson,
                unifying_id_field=entities.StatePerson.get_class_id_name(),
                build_related_entities=True))

        # Get StateSentenceGroups
        sentence_groups = (
            test_pipeline
            | 'Load StateSentenceGroups' >>  # type: ignore
            extractor_utils.BuildRootEntity(
                dataset=dataset,
                root_entity_class=entities.StateSentenceGroup,
                unifying_id_field=entities.StatePerson.get_class_id_name(),
                build_related_entities=True,
                unifying_id_field_filter_set=unifying_id_field_filter_set))

        # Get StateIncarcerationSentences
        incarceration_sentences = (
            test_pipeline
            | 'Load StateIncarcerationSentences' >>  # type: ignore
            extractor_utils.BuildRootEntity(
                dataset=dataset,
                root_entity_class=entities.StateIncarcerationSentence,
                unifying_id_field=entities.StatePerson.get_class_id_name(),
                build_related_entities=True,
                unifying_id_field_filter_set=unifying_id_field_filter_set))

        # Get StateSupervisionSentences
        supervision_sentences = (
            test_pipeline | 'Load StateSupervisionSentences' >>  # type: ignore
            extractor_utils.BuildRootEntity(
                dataset=dataset,
                root_entity_class=entities.StateSupervisionSentence,
                unifying_id_field=entities.StatePerson.get_class_id_name(),
                build_related_entities=True,
                unifying_id_field_filter_set=unifying_id_field_filter_set))

        us_mo_sentence_status_rows: List[Dict[str, Any]] = [{
            'person_id':
            fake_person_id,
            'sentence_external_id':
            'XXX',
            'sentence_status_external_id':
            'YYY',
            'status_code':
            'ZZZ',
            'status_date':
            'not_a_date',
            'status_description':
            'XYZ'
        }]

        us_mo_sentence_statuses = (test_pipeline
                                   | 'Create MO sentence statuses' >>
                                   beam.Create(us_mo_sentence_status_rows))

        us_mo_sentence_status_rankings_as_kv = (
            us_mo_sentence_statuses
            | 'Convert sentence status ranking table to KV tuples' >>
            beam.ParDo(ConvertDictToKVTuple(), 'person_id'))

        sentences_and_statuses = (
            {
                'incarceration_sentences': incarceration_sentences,
                'supervision_sentences': supervision_sentences,
                'sentence_statuses': us_mo_sentence_status_rankings_as_kv
            }
            | 'Group sentences to the sentence statuses for that person' >>
            beam.CoGroupByKey())

        sentences_converted = (
            sentences_and_statuses
            | 'Convert to state-specific sentences' >> beam.ParDo(
                ConvertSentencesToStateSpecificType()).with_outputs(
                    'incarceration_sentences', 'supervision_sentences'))

        sentences_and_sentence_groups = (
            {
                'sentence_groups': sentence_groups,
                'incarceration_sentences':
                sentences_converted.incarceration_sentences,
                'supervision_sentences':
                sentences_converted.supervision_sentences
            }
            | 'Group sentences to sentence groups' >> beam.CoGroupByKey())

        sentence_groups_with_hydrated_sentences = (
            sentences_and_sentence_groups
            | 'Set hydrated sentences on sentence groups' >> beam.ParDo(
                SetSentencesOnSentenceGroup()))

        # Identify IncarcerationEvents events from the StatePerson's
        # StateIncarcerationPeriods
        fake_person_id_to_county_query_result = [{
            'person_id':
            fake_person_id,
            'county_of_residence':
            _COUNTY_OF_RESIDENCE
        }]
        person_id_to_county_kv = (
            test_pipeline
            | "Read person id to county associations from BigQuery" >>
            beam.Create(fake_person_id_to_county_query_result)
            | "Convert person_id to counties to KV" >> beam.ParDo(
                ConvertDictToKVTuple(), 'person_id'))

        incarceration_period_judicial_district_association_row = \
            {'person_id': fake_person_id, 'incarceration_period_id': 123, 'judicial_district_code': 'NW'}

        ip_to_judicial_district_kv = (
            test_pipeline
            |
            "Read incarceration_period to judicial_district associations from BigQuery"
            >> beam.Create(
                [incarceration_period_judicial_district_association_row])
            | "Convert ips to judicial districts to KV" >> beam.ParDo(
                ConvertDictToKVTuple(), 'person_id'))

        state_race_ethnicity_population_count = {
            'state_code': state_code,
            'race_or_ethnicity': 'BLACK',
            'population_count': 1,
            'representation_priority': 1
        }

        state_race_ethnicity_population_counts = (
            test_pipeline
            | 'Create state_race_ethnicity_population_count table' >>
            beam.Create([state_race_ethnicity_population_count]))

        # Group each StatePerson with their related entities
        person_entities = (
            {
                'person':
                persons,
                'sentence_groups':
                sentence_groups_with_hydrated_sentences,
                'incarceration_period_judicial_district_association':
                ip_to_judicial_district_kv
            }
            | 'Group StatePerson to SentenceGroups' >> beam.CoGroupByKey())

        # Identify IncarcerationEvents events from the StatePerson's StateIncarcerationPeriods
        person_incarceration_events = (
            person_entities | 'Classify Incarceration Events' >> beam.ParDo(
                pipeline.ClassifyIncarcerationEvents(),
                AsDict(person_id_to_county_kv)))

        person_metadata = (
            persons
            | "Build the person_metadata dictionary" >> beam.ParDo(
                BuildPersonMetadata(),
                AsList(state_race_ethnicity_population_counts)))

        person_incarceration_events_with_metadata = (
            {
                'person_events': person_incarceration_events,
                'person_metadata': person_metadata
            }
            | 'Group IncarcerationEvents with person-level metadata' >>
            beam.CoGroupByKey()
            |
            'Organize StatePerson, PersonMetadata and IncarcerationEvents for calculations'
            >> beam.ParDo(ExtractPersonEventsMetadata()))

        # Get pipeline job details for accessing job_id
        all_pipeline_options = PipelineOptions().get_all_options()

        # Add timestamp for local jobs
        job_timestamp = datetime.datetime.now().strftime(
            '%Y-%m-%d_%H_%M_%S.%f')
        all_pipeline_options['job_timestamp'] = job_timestamp

        metric_types = metric_types_filter if metric_types_filter else {'ALL'}

        # Get IncarcerationMetrics
        incarceration_metrics = (
            person_incarceration_events_with_metadata
            | 'Get Incarceration Metrics' >>  # type: ignore
            pipeline.GetIncarcerationMetrics(
                pipeline_options=all_pipeline_options,
                metric_types=metric_types,
                calculation_end_month=None,
                calculation_month_count=-1))

        assert_that(
            incarceration_metrics,
            AssertMatchers.validate_metric_type(allow_empty=allow_empty),
            'Assert that all metrics are of the expected type.')

        assert_that(
            incarceration_metrics,
            AssertMatchers.validate_pipeline_test(expected_metric_types),
            'Assert the type of metrics produced are expected')

        test_pipeline.run()
Example #6
def make_beam_pipeline(root,
                       input_filenames,
                       sample_rate,
                       debug,
                       embedding_names,
                       embedding_modules,
                       module_output_keys,
                       audio_key,
                       sample_rate_key,
                       label_key,
                       speaker_id_key,
                       average_over_time,
                       delete_audio_from_output,
                       output_filename,
                       input_format='tfrecord',
                       output_format='tfrecord',
                       suffix='Main'):
    """Construct beam pipeline for mapping from audio to embeddings.

  Args:
    root: The beam root node.
    input_filenames: Python list. List of input files.
    sample_rate: Python int, or `None`. The sample rate for all embeddings,
      or `None` if this is a TFDS dataset, or if each example has its own sample
      rate.
    debug: Python bool. Whether to operate in debug mode.
    embedding_names: Python list of embeddings.
    embedding_modules: Python list of TF-Hub modules.
    module_output_keys: Python list of strings, names of module output keys.
    audio_key: Python string, the key of the audio.
    sample_rate_key: Python string or `None`, the key for the sample rate.
    label_key: Python string. Field for label.
    speaker_id_key: Python string or `None`. Key for speaker ID, or `None`.
    average_over_time: Python bool. If `True`, average over the time axis.
    delete_audio_from_output: Python bool. Whether to remove audio from
      outputs.
    output_filename: Python string. Output filename.
    input_format: Python string. Must correspond to a function in
      `reader_functions`.
    output_format: Python string. Must correspond to a function in
      `writer_functions`.
    suffix: Python string. Suffix to stage names to make them unique.
  """
    tf_examples_key_ = 'tf_examples'
    assert tf_examples_key_ not in embedding_names
    s = suffix  # for code brevity.

    # Read from input.
    input_examples = reader_functions[input_format](root, input_filenames, s)

    # In debug mode, take one input example.
    if debug:
        input_examples = (
            input_examples
            | f'TakeOne{s}' >>
            beam.transforms.combiners.Sample.FixedSizeGlobally(1)
            # Sampling generates lists, so flatten back into one collection.
            | f'DebugFlatten{s}' >> beam.FlatMap(lambda x: x))

    # Compute all the embeddings simultaneously.
    embedding_tables = {}
    for name, mod, out_key in zip(embedding_names, embedding_modules,
                                  module_output_keys):
        logging.info('Adding signal: %s %s, %s', name, mod, out_key)
        tbl = input_examples | f'ComputeEmbedding-{name}-{s}' >> beam.ParDo(
            ComputeEmbeddingMapFn(name=name,
                                  module=mod,
                                  output_key=out_key,
                                  audio_key=audio_key,
                                  sample_rate_key=sample_rate_key,
                                  sample_rate=sample_rate,
                                  average_over_time=average_over_time))
        embedding_tables[name] = tbl
    assert tf_examples_key_ not in embedding_tables
    embedding_tables[tf_examples_key_] = input_examples
    logging.info('embedding_tables: %s', embedding_tables)

    # Combine embeddings and tf.train.Example, using the common key.
    combined_tbl = (embedding_tables
                    | f'CombineEmbeddingTables-{s}' >> beam.CoGroupByKey()
                    | f'AddEmbeddings-{s}' >> beam.Map(
                        _add_embedding_column_map_fn,
                        original_example_key=tf_examples_key_,
                        delete_audio_from_output=delete_audio_from_output,
                        audio_key=audio_key,
                        label_key=label_key,
                        speaker_id_key=speaker_id_key))

    output_filename = f'{output_filename}@*'
    logging.info('Writing to %s', output_filename)
    writer_functions[output_format](combined_tbl, output_filename, s)
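
The docstring above spells out each argument of make_beam_pipeline. As a hedged illustration of how it might be called (every path, module handle, name, and key below is a placeholder chosen for this sketch, not a value from the original source), assuming the default 'tfrecord' reader and writer:

import apache_beam as beam

with beam.Pipeline() as root:
    # Hypothetical invocation; all values are placeholders.
    make_beam_pipeline(
        root,
        input_filenames=['/tmp/audio.tfrecord'],
        sample_rate=16000,
        debug=False,
        embedding_names=['my_embedding'],
        embedding_modules=['/tmp/hub_module'],
        module_output_keys=['embedding'],
        audio_key='audio',
        sample_rate_key=None,
        label_key='label',
        speaker_id_key=None,
        average_over_time=True,
        delete_audio_from_output=True,
        output_filename='/tmp/embeddings',
        input_format='tfrecord',
        output_format='tfrecord',
        suffix='Main')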
Example #7
    def expand(self, pcoll):
        return (pcoll
                | beam.CoGroupByKey()
                | beam.ParDo(self.ConcatFn()).with_output_types(
                    typing.Tuple[int, typing.Iterable[unicode]]))
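
Example #7 shows only the join step. As a minimal, self-contained sketch of the CoGroupByKey shape it relies on (the keys, tags, and values below are illustrative and not taken from any of these examples), a dict of keyed PCollections is merged into (key, {tag: [values], ...}) pairs:

import apache_beam as beam

with beam.Pipeline() as p:
    emails = p | 'Create emails' >> beam.Create([(1, 'a@example.com'), (2, 'b@example.com')])
    phones = p | 'Create phones' >> beam.Create([(1, '555-0100')])
    _ = (
        {'emails': emails, 'phones': phones}
        | beam.CoGroupByKey()
        # Each element looks like (1, {'emails': ['a@example.com'], 'phones': ['555-0100']})
        | beam.Map(print))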
Example #8
    def execute_pipeline(
        self,
        pipeline: beam.Pipeline,
        all_pipeline_options: Dict[str, Any],
        state_code: str,
        input_dataset: str,
        _reference_dataset: str,
        static_reference_dataset: str,
        metric_types: List[str],
        person_id_filter_set: Optional[Set[int]],
        calculation_month_count: int = -1,
        calculation_end_month: Optional[str] = None,
    ) -> beam.Pipeline:
        # Get StatePersons
        persons = pipeline | "Load Persons" >> BuildRootEntity(
            dataset=input_dataset,
            root_entity_class=entities.StatePerson,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=True,
            unifying_id_field_filter_set=person_id_filter_set,
            state_code=state_code,
        )

        # Get StateSupervisionViolations
        supervision_violations = (
            pipeline
            | "Load SupervisionViolations"
            >> BuildRootEntity(
                dataset=input_dataset,
                root_entity_class=entities.StateSupervisionViolation,
                unifying_id_field=entities.StatePerson.get_class_id_name(),
                build_related_entities=True,
                unifying_id_field_filter_set=person_id_filter_set,
                state_code=state_code,
            )
        )

        # Get StateSupervisionViolationResponses
        supervision_violation_responses = (
            pipeline
            | "Load SupervisionViolationResponses"
            >> BuildRootEntity(
                dataset=input_dataset,
                root_entity_class=entities.StateSupervisionViolationResponse,
                unifying_id_field=entities.StatePerson.get_class_id_name(),
                build_related_entities=True,
                unifying_id_field_filter_set=person_id_filter_set,
                state_code=state_code,
            )
        )

        state_race_ethnicity_population_counts = (
            pipeline
            | "Load state_race_ethnicity_population_counts"
            >> ImportTable(
                dataset_id=static_reference_dataset,
                table_id="state_race_ethnicity_population_counts",
                state_code_filter=state_code,
                person_id_filter_set=None,
            )
        )

        # Group StateSupervisionViolationResponses and StateSupervisionViolations by person_id
        supervision_violations_and_responses = (
            {
                "violations": supervision_violations,
                "violation_responses": supervision_violation_responses,
            }
            | "Group StateSupervisionViolationResponses to StateSupervisionViolations"
            >> beam.CoGroupByKey()
        )

        violations_with_hydrated_violation_responses = (
            supervision_violations_and_responses
            | "Set hydrated StateSupervisionViolationResponses on the StateSupervisionViolations"
            >> beam.ParDo(SetViolationResponsesOntoViolations())
        )

        person_entities = {
            "person": persons,
            "violations": violations_with_hydrated_violation_responses,
        } | "Group StatePerson to violation entities" >> beam.CoGroupByKey()

        person_violation_events = person_entities | "Get ViolationEvents" >> beam.ParDo(
            ClassifyEvents(), identifier=self.pipeline_config.identifier
        )

        person_metadata = (
            persons
            | "Build the person_metadata dictionary"
            >> beam.ParDo(
                BuildPersonMetadata(),
                state_race_ethnicity_population_counts=AsList(
                    state_race_ethnicity_population_counts
                ),
            )
        )

        person_violation_events_with_metadata = (
            {
                "person_events": person_violation_events,
                "person_metadata": person_metadata,
            }
            | "Group ViolationEvents with person-level metadata" >> beam.CoGroupByKey()
            | "Organize StatePerson, PersonMetadata and ViolationEvents for calculations"
            >> beam.ParDo(ExtractPersonEventsMetadata())
        )

        metric_types_set = set(metric_types)
        job_timestamp = datetime.now().strftime("%Y-%m-%d_%H_%M_%S.%f")
        all_pipeline_options["job_timestamp"] = job_timestamp

        # Get violation metrics
        violation_metrics = (
            person_violation_events_with_metadata
            | "Get Violation Metrics"
            >> GetMetrics(
                pipeline_options=all_pipeline_options,
                pipeline_config=self.pipeline_config,
                metric_types_to_include=metric_types_set,
                calculation_end_month=calculation_end_month,
                calculation_month_count=calculation_month_count,
            )
        )

        return violation_metrics
Example #9
        # 'word counts' strings. This yields a mapping from URI to a dictionary
        # that maps the above mentioned tag strings to an iterable containing the
        # word total for that URI and word and count respectively.
        #
        # A diagram (in which '[]' just means 'iterable'):
        #
        #   URI: {'word totals': [count],  # Total words within this URI's document.
        #         'word counts': [(word, count),  # Counts of specific words
        #                         (word, count),  # within this URI's document.
        #                         ... ]}
        uri_to_word_and_count_and_total = (
            {
                'word totals': uri_to_word_total,
                'word counts': uri_to_word_and_count
            }
            | 'CoGroupByUri' >> beam.CoGroupByKey())

        # Compute a mapping from each word to a (URI, term frequency) pair for each
        # URI. A word's term frequency for a document is simply the number of times
        # that word occurs in the document divided by the total number of words in
        # the document.

        def compute_term_frequency(uri_count_and_total):
            uri, count_and_total = uri_count_and_total
            word_and_count = count_and_total['word counts']
            # We have an iterable for one element that we want extracted.
            [word_total] = count_and_total['word totals']
            for word, count in word_and_count:
                yield word, (uri, float(count) / word_total)

        word_to_uri_and_tf = (
            uri_to_word_and_count_and_total
Example #10
    def run(self) -> beam.PCollection[job_run_result.JobRunResult]:
        """Generates the translation contributins stats.

        Returns:
            PCollection. A PCollection of 'SUCCESS x' results, where x is
            the number of generated stats.
        """
        suggestions_grouped_by_target = (
            self.pipeline
            | 'Get all non-deleted suggestion models' >> ndb_io.GetModels(
                suggestion_models.GeneralSuggestionModel.get_all(
                    include_deleted=False))
            # We need to window the models so that CoGroupByKey below
            # works properly.
            | 'Filter translate suggestions' >> beam.Filter(lambda m: (
                m.suggestion_type == feconf.SUGGESTION_TYPE_TRANSLATE_CONTENT))
            | 'Transform to suggestion domain object' >> beam.Map(
                suggestion_services.get_suggestion_from_model)
            | 'Group by target' >> beam.GroupBy(lambda m: m.target_id))
        exp_opportunities = (
            self.pipeline
            | 'Get all non-deleted opportunity models' >> ndb_io.GetModels(
                opportunity_models.ExplorationOpportunitySummaryModel.get_all(
                    include_deleted=False))
            # We need to window the models so that CoGroupByKey below
            # works properly.
            | 'Transform to opportunity domain object' >>
            beam.Map(opportunity_services.
                     get_exploration_opportunity_summary_from_model)
            | 'Group by ID' >> beam.GroupBy(lambda m: m.id))

        user_stats_results = (
            {
                'suggestion': suggestions_grouped_by_target,
                'opportunity': exp_opportunities
            }
            | 'Merge models' >> beam.CoGroupByKey()
            | 'Get rid of key' >> beam.Values()  # pylint: disable=no-value-for-parameter
            | 'Generate stats' >> beam.ParDo(lambda x: self._generate_stats(
                x['suggestion'][0] if len(x['suggestion']) else [],
                list(x['opportunity'][0])[0]
                if len(x['opportunity']) else None)))

        user_stats_models = (
            user_stats_results
            | 'Filter ok results' >>
            beam.Filter(lambda key_and_result: key_and_result[1].is_ok())
            | 'Unpack result' >> beam.MapTuple(lambda key, result:
                                               (key, result.unwrap()))
            | 'Combine the stats' >> beam.CombinePerKey(CombineStats())
            | 'Generate models from stats' >> beam.MapTuple(
                self._generate_translation_contribution_model))

        user_stats_error_job_run_results = (
            user_stats_results
            | 'Filter err results' >>
            beam.Filter(lambda key_and_result: key_and_result[1].is_err())
            # Pylint disable is needed because pylint is not able to correctly
            # detect that the value is passed through the pipe.
            | 'Remove keys' >> beam.Values()  # pylint: disable=no-value-for-parameter
            | 'Transform result to job run result' >>
            (job_result_transforms.ResultsToJobRunResults()))

        unused_put_result = (
            user_stats_models
            | 'Put models into the datastore' >> ndb_io.PutModels())

        user_stats_models_job_run_results = (
            user_stats_models
            | 'Create job run result' >>
            (job_result_transforms.CountObjectsToJobRunResult()))

        return ((user_stats_error_job_run_results,
                 user_stats_models_job_run_results)
                | 'Merge job run results' >> beam.Flatten())
Example #11
with beam.Pipeline() as co_group_by_key_pipeline:
    dept_col = (
            co_group_by_key_pipeline

            | "Read dept_data.txt" >>
            beam.io.ReadFromText("../resources/dept_data.txt")

            | "dept_col -> Create formatted (key, value) tuple" >>
            beam.Map(formatted_key_value_tuple)
    )

    loc_col = (
            co_group_by_key_pipeline

            | "Read location.txt" >>
            beam.io.ReadFromText("../resources/dept_data.txt")

            | "loc_col -> Create formatted (key, value) tuple" >>
            beam.Map(formatted_key_value_tuple)
    )

    (
            {'department_collection': dept_col, 'location_collection': loc_col}

            | "CoGroupByKey" >>
            beam.CoGroupByKey()

            | "Write output" >>
            beam.io.WriteToText("co_group_by_key/output")
    )
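
Example #11 relies on a formatted_key_value_tuple helper that is not shown. A plausible sketch, assuming each line of the two input files is a comma-separated record keyed by its first field (the actual layout of dept_data.txt and location.txt is an assumption), is:

def formatted_key_value_tuple(line: str):
    # Hypothetical helper: key each comma-separated record by its first field
    # so the two collections can be joined with CoGroupByKey.
    fields = line.split(',')
    return fields[0], fields[1:]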
Example #12
        beam.FlatMap(lambda stream: [stream.detrend('demean')])
        | 'Remove trend' >>
        beam.FlatMap(lambda stream: [stream.detrend('linear')])
        | 'Resample to 100 Hz' >>
        beam.FlatMap(lambda stream: [stream.resample(100)])
        | 'Trim traces' >> beam.ParDo(TrimTrace(points=3001)))

    station_location = location | '(sta, loc)' >> beam.FlatMap(
        lambda loc: [(loc['station'], loc)])
    station_pick = picks | '(sta, pick)' >> beam.FlatMap(
        lambda pick: [(pick.waveform_id.station_code, pick)])
    station_stream = streams | '(sta, stream)' >> beam.FlatMap(
        lambda stream: [(stream[0].stats.station, stream)])

    dataset = (
        {
            'pick': station_pick,
            'stream': station_stream,
            'location': station_location
        }
        | 'Join by station' >> beam.CoGroupByKey()
        | 'Drop empty station' >> beam.ParDo(DropEmptyStation())
        | 'Group stream pick by time' >> GroupStreamPick()
        | 'Generate stream PDFs' >> beam.ParDo(GeneratePDF(sigma=0.1))
        | 'Extract stream features' >> beam.ParDo(StreamFeatureExtraction()))

    transform = (dataset
                 | 'Feature to Example' >> beam.ParDo(FeatureToExample())
                 | 'Write dataset' >> tfrecordio.WriteToTFRecord(
                     tfrecord_dir, coder=beam.coders.ProtoCoder))
Example #13
                                                     skip_header_lines=1)
    | "De texto para lista (chuvas)" >> beam.Map(texto_para_lista,
                                                 delimitador=',')
    | "Criando a chave UF-ANO-MES" >> beam.Map(chave_uf_ano_mes_de_lista)
    | "Soma do total de chuvas pela chave" >> beam.CombinePerKey(sum)
    | "Arrendondar resultados de chuvas" >> beam.Map(arredonda)
    # | "Mostrar resultados" >> beam.Map(print)
)

resultado = (
    # (chuvas, dengue)
    # | "Empilha as pcols" >> beam.Flatten()
    # | "Agrupa as pcols" >> beam.GroupByKey()
    ({
        'chuvas': chuvas,
        'dengue': dengue
    })
    | 'Mesclar pcols' >> beam.CoGroupByKey()
    | 'Filtrar dados vazios' >> beam.Filter(filtra_campos_vazios)
    | 'Descompactar elementos' >> beam.Map(descompactar_elementos)
    | 'Preparar csv' >> beam.Map(preparar_csv)
    # | "Mostrar resultados da união" >> beam.Map(print)
)

# uf, ano, mes, str(chuva), str(dengue)
header = 'UF;ANO;MES;CHUVA;DENGUE'

resultado | 'Criar arquivo CSV' >> WriteToText(
    'resultado', file_name_suffix='.csv', header=header)

pipeline.run()
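
Example #13 keeps only keys that have data in both collections before unpacking them into CSV rows. The helpers filtra_campos_vazios, descompactar_elementos, and preparar_csv are not shown; a hedged sketch of the filter, based only on how it is applied to the CoGroupByKey output, might look like:

def filtra_campos_vazios(elemento):
    # Hypothetical filter: keep a key only if both the 'chuvas' (rain) and
    # 'dengue' groupings contain at least one value.
    chave, dados = elemento
    return all([dados['chuvas'], dados['dengue']])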
Example #14
def run(apache_beam_pipeline_options: PipelineOptions, data_input: str,
        reference_input: str, output: str, calculation_month_count: int,
        metric_types: List[str], state_code: Optional[str],
        calculation_end_month: Optional[str],
        person_filter_ids: Optional[List[int]]):
    """Runs the program calculation pipeline."""

    # Workaround to load SQLAlchemy objects at start of pipeline. This is necessary because the BuildRootEntity
    # function tries to access attributes of relationship properties on the SQLAlchemy room_schema_class before they
    # have been loaded. However, if *any* SQLAlchemy objects have been instantiated, then the relationship properties
    # are loaded and their attributes can be successfully accessed.
    _ = schema.StatePerson()

    apache_beam_pipeline_options.view_as(SetupOptions).save_main_session = True

    # Get pipeline job details
    all_pipeline_options = apache_beam_pipeline_options.get_all_options()

    input_dataset = all_pipeline_options['project'] + '.' + data_input
    reference_dataset = all_pipeline_options['project'] + '.' + reference_input

    person_id_filter_set = set(
        person_filter_ids) if person_filter_ids else None

    with beam.Pipeline(options=apache_beam_pipeline_options) as p:
        # Get StatePersons
        persons = (p | 'Load Persons' >> BuildRootEntity(
            dataset=input_dataset,
            root_entity_class=entities.StatePerson,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=True,
            unifying_id_field_filter_set=person_id_filter_set))

        # Get StateProgramAssignments
        program_assignments = (
            p | 'Load Program Assignments' >> BuildRootEntity(
                dataset=input_dataset,
                root_entity_class=entities.StateProgramAssignment,
                unifying_id_field=entities.StatePerson.get_class_id_name(),
                build_related_entities=True,
                unifying_id_field_filter_set=person_id_filter_set,
                state_code=state_code))

        # Get StateAssessments
        assessments = (p | 'Load Assessments' >> BuildRootEntity(
            dataset=input_dataset,
            root_entity_class=entities.StateAssessment,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=False,
            unifying_id_field_filter_set=person_id_filter_set,
            state_code=state_code))

        # Get StateSupervisionPeriods
        supervision_periods = (
            p | 'Load SupervisionPeriods' >> BuildRootEntity(
                dataset=input_dataset,
                root_entity_class=entities.StateSupervisionPeriod,
                unifying_id_field=entities.StatePerson.get_class_id_name(),
                build_related_entities=False,
                unifying_id_field_filter_set=person_id_filter_set,
                state_code=state_code))

        supervision_period_to_agent_association_query = select_all_by_person_query(
            reference_dataset,
            SUPERVISION_PERIOD_TO_AGENT_ASSOCIATION_VIEW_NAME, state_code,
            person_id_filter_set)

        supervision_period_to_agent_associations = (
            p | "Read Supervision Period to Agent table from BigQuery" >>
            beam.io.Read(
                beam.io.BigQuerySource(
                    query=supervision_period_to_agent_association_query,
                    use_standard_sql=True)))

        # Convert the association table rows into key-value tuples with the value for the supervision_period_id column
        # as the key
        supervision_period_to_agent_associations_as_kv = (
            supervision_period_to_agent_associations
            | 'Convert Supervision Period to Agent table to KV tuples' >>
            beam.ParDo(ConvertDictToKVTuple(), 'supervision_period_id'))

        # Group each StatePerson with their other entities
        persons_entities = ({
            'person': persons,
            'program_assignments': program_assignments,
            'assessments': assessments,
            'supervision_periods': supervision_periods
        }
                            |
                            'Group StatePerson to StateProgramAssignments and'
                            >> beam.CoGroupByKey())

        # Identify ProgramEvents from the StatePerson's StateProgramAssignments
        person_program_events = (
            persons_entities
            | beam.ParDo(
                ClassifyProgramAssignments(),
                AsDict(supervision_period_to_agent_associations_as_kv)))

        # Get pipeline job details for accessing job_id
        all_pipeline_options = apache_beam_pipeline_options.get_all_options()

        # Add timestamp for local jobs
        job_timestamp = datetime.datetime.now().strftime(
            '%Y-%m-%d_%H_%M_%S.%f')
        all_pipeline_options['job_timestamp'] = job_timestamp

        # Get the type of metric to calculate
        metric_types_set = set(metric_types)

        # Get program metrics
        program_metrics = (
            person_program_events | 'Get Program Metrics' >> GetProgramMetrics(
                pipeline_options=all_pipeline_options,
                metric_types=metric_types_set,
                calculation_end_month=calculation_end_month,
                calculation_month_count=calculation_month_count))

        if person_id_filter_set:
            logging.warning(
                "Non-empty person filter set - returning before writing metrics."
            )
            return

        # Convert the metrics into a format that's writable to BQ
        writable_metrics = (
            program_metrics
            | 'Convert to dict to be written to BQ' >> beam.ParDo(
                ProgramMetricWritableDict()).with_outputs(
                    'participation', 'referrals'))

        # Write the metrics to the output tables in BigQuery
        referrals_table_id = DATAFLOW_METRICS_TO_TABLES.get(
            ProgramReferralMetric)
        participation_table_id = DATAFLOW_METRICS_TO_TABLES.get(
            ProgramParticipationMetric)

        _ = (writable_metrics.referrals
             | f"Write referral metrics to BQ table: {referrals_table_id}" >>
             beam.io.WriteToBigQuery(
                 table=referrals_table_id,
                 dataset=output,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
                 method=beam.io.WriteToBigQuery.Method.FILE_LOADS))

        _ = (
            writable_metrics.participation |
            f"Write participation metrics to BQ table: {participation_table_id}"
            >> beam.io.WriteToBigQuery(
                table=participation_table_id,
                dataset=output,
                create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
                method=beam.io.WriteToBigQuery.Method.FILE_LOADS))
Example #15
    def run_test_pipeline(
            self,
            dataset: str,
            fake_supervision_period_id: int,
            unifying_id_field_filter_set: Optional[Set[int]] = None,
            metric_types_filter: Optional[Set[str]] = None):
        """Runs a test version of the program pipeline."""
        test_pipeline = TestPipeline()

        # Get StatePersons
        persons = (
            test_pipeline
            | 'Load Persons' >>  # type: ignore
            extractor_utils.BuildRootEntity(
                dataset=dataset,
                root_entity_class=entities.StatePerson,
                unifying_id_field=entities.StatePerson.get_class_id_name(),
                build_related_entities=True))

        # Get StateProgramAssignments
        program_assignments = (
            test_pipeline
            | 'Load Program Assignments' >>  # type: ignore
            extractor_utils.BuildRootEntity(
                dataset=dataset,
                root_entity_class=entities.StateProgramAssignment,
                unifying_id_field=entities.StatePerson.get_class_id_name(),
                build_related_entities=True,
                unifying_id_field_filter_set=unifying_id_field_filter_set))

        # Get StateAssessments
        assessments = (
            test_pipeline
            | 'Load Assessments' >>  # type: ignore
            extractor_utils.BuildRootEntity(
                dataset=dataset,
                root_entity_class=entities.StateAssessment,
                unifying_id_field=entities.StatePerson.get_class_id_name(),
                build_related_entities=False,
                unifying_id_field_filter_set=unifying_id_field_filter_set))

        # Get StateSupervisionPeriods
        supervision_periods = (
            test_pipeline
            | 'Load SupervisionPeriods' >>  # type: ignore
            extractor_utils.BuildRootEntity(
                dataset=dataset,
                root_entity_class=entities.StateSupervisionPeriod,
                unifying_id_field=entities.StatePerson.get_class_id_name(),
                build_related_entities=False,
                unifying_id_field_filter_set=unifying_id_field_filter_set))

        supervision_period_to_agent_map = {
            'agent_id': 1010,
            'agent_external_id': 'OFFICER0009',
            'district_external_id': '10',
            'supervision_period_id': fake_supervision_period_id
        }

        supervision_period_to_agent_associations = (
            test_pipeline
            | 'Create SupervisionPeriod to Agent table' >> beam.Create(
                [supervision_period_to_agent_map]))

        supervision_period_to_agent_associations_as_kv = (
            supervision_period_to_agent_associations
            | 'Convert SupervisionPeriod to Agent table to KV tuples' >>
            beam.ParDo(pipeline.ConvertDictToKVTuple(),
                       'supervision_period_id'))

        # Group each StatePerson with their other entities
        persons_entities = ({
            'person': persons,
            'program_assignments': program_assignments,
            'assessments': assessments,
            'supervision_periods': supervision_periods
        }
                            |
                            'Group StatePerson to StateProgramAssignments and'
                            >> beam.CoGroupByKey())

        # Identify ProgramEvents from the StatePerson's
        # StateProgramAssignments
        person_program_events = (
            persons_entities
            | beam.ParDo(
                pipeline.ClassifyProgramAssignments(),
                AsDict(supervision_period_to_agent_associations_as_kv)))

        # Get pipeline job details for accessing job_id
        all_pipeline_options = PipelineOptions().get_all_options()

        # Add timestamp for local jobs
        job_timestamp = datetime.datetime.now().strftime(
            '%Y-%m-%d_%H_%M_%S.%f')
        all_pipeline_options['job_timestamp'] = job_timestamp

        metric_types = metric_types_filter if metric_types_filter else {'ALL'}

        # Get program metrics
        program_metrics = (
            person_program_events
            | 'Get Program Metrics' >>  # type: ignore
            pipeline.GetProgramMetrics(pipeline_options=all_pipeline_options,
                                       metric_types=metric_types,
                                       calculation_end_month=None,
                                       calculation_month_count=-1))

        assert_that(program_metrics, AssertMatchers.validate_pipeline_test())

        test_pipeline.run()
Example #16
def run(
    apache_beam_pipeline_options: PipelineOptions,
    data_input: str,
    reference_view_input: str,
    static_reference_input: str,
    output: str,
    metric_types: List[str],
    state_code: str,
    person_filter_ids: Optional[List[int]],
):
    """Runs the recidivism calculation pipeline."""

    # Workaround to load SQLAlchemy objects at start of pipeline. This is
    # necessary because the BuildRootEntity function tries to access attributes
    # of relationship properties on the SQLAlchemy room_schema_class before they
    # have been loaded. However, if *any* SQLAlchemy objects have been
    # instantiated, then the relationship properties are loaded and their
    # attributes can be successfully accessed.
    _ = schema.StatePerson()

    apache_beam_pipeline_options.view_as(SetupOptions).save_main_session = True

    # Get pipeline job details
    all_pipeline_options = apache_beam_pipeline_options.get_all_options()
    project_id = all_pipeline_options["project"]

    if project_id is None:
        raise ValueError(f"No project set in pipeline options: {all_pipeline_options}")

    if state_code is None:
        raise ValueError("No state_code set for pipeline")

    input_dataset = project_id + "." + data_input
    reference_dataset = project_id + "." + reference_view_input
    static_reference_dataset = project_id + "." + static_reference_input

    person_id_filter_set = set(person_filter_ids) if person_filter_ids else None

    with beam.Pipeline(options=apache_beam_pipeline_options) as p:
        # Get StatePersons
        persons = p | "Load Persons" >> BuildRootEntity(
            dataset=input_dataset,
            root_entity_class=entities.StatePerson,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=True,
            unifying_id_field_filter_set=person_id_filter_set,
            state_code=state_code,
        )

        # Get StateIncarcerationPeriods
        incarceration_periods = p | "Load IncarcerationPeriods" >> BuildRootEntity(
            dataset=input_dataset,
            root_entity_class=entities.StateIncarcerationPeriod,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=True,
            unifying_id_field_filter_set=person_id_filter_set,
            state_code=state_code,
        )

        # Get StateSupervisionViolations
        supervision_violations = p | "Load SupervisionViolations" >> BuildRootEntity(
            dataset=input_dataset,
            root_entity_class=entities.StateSupervisionViolation,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=True,
            unifying_id_field_filter_set=person_id_filter_set,
            state_code=state_code,
        )

        # TODO(#2769): Don't bring this in as a root entity
        # Get StateSupervisionViolationResponses
        supervision_violation_responses = (
            p
            | "Load SupervisionViolationResponses"
            >> BuildRootEntity(
                dataset=input_dataset,
                root_entity_class=entities.StateSupervisionViolationResponse,
                unifying_id_field=entities.StatePerson.get_class_id_name(),
                build_related_entities=True,
                unifying_id_field_filter_set=person_id_filter_set,
                state_code=state_code,
            )
        )

        # Group StateSupervisionViolationResponses and
        # StateSupervisionViolations by person_id
        supervision_violations_and_responses = (
            {
                "violations": supervision_violations,
                "violation_responses": supervision_violation_responses,
            }
            | "Group StateSupervisionViolationResponses to "
            "StateSupervisionViolations" >> beam.CoGroupByKey()
        )

        # Set the fully hydrated StateSupervisionViolation entities on
        # the corresponding StateSupervisionViolationResponses
        violation_responses_with_hydrated_violations = (
            supervision_violations_and_responses
            | "Set hydrated StateSupervisionViolations on "
            "the StateSupervisionViolationResponses"
            >> beam.ParDo(SetViolationOnViolationsResponse())
        )

        # Group StateIncarcerationPeriods and StateSupervisionViolationResponses
        # by person_id
        incarceration_periods_and_violation_responses = (
            {
                "incarceration_periods": incarceration_periods,
                "violation_responses": violation_responses_with_hydrated_violations,
            }
            | "Group StateIncarcerationPeriods to "
            "StateSupervisionViolationResponses" >> beam.CoGroupByKey()
        )

        # Set the fully hydrated StateSupervisionViolationResponse entities on
        # the corresponding StateIncarcerationPeriods
        incarceration_periods_with_source_violations = (
            incarceration_periods_and_violation_responses
            | "Set hydrated StateSupervisionViolationResponses on "
            "the StateIncarcerationPeriods"
            >> beam.ParDo(SetViolationResponseOnIncarcerationPeriod())
        )

        # Bring in the table that associates people and their county of residence
        person_id_to_county_kv = (
            p
            | "Load person_id_to_county_kv"
            >> ImportTableAsKVTuples(
                dataset_id=reference_dataset,
                table_id=PERSONS_TO_RECENT_COUNTY_OF_RESIDENCE_VIEW_NAME,
                table_key="person_id",
                state_code_filter=state_code,
                person_id_filter_set=person_id_filter_set,
            )
        )

        # Group each StatePerson with their StateIncarcerationPeriods
        person_entities = {
            "person": persons,
            "incarceration_periods": incarceration_periods_with_source_violations,
            "persons_to_recent_county_of_residence": person_id_to_county_kv,
        } | "Group StatePerson to StateIncarcerationPeriods" >> beam.CoGroupByKey()

        state_race_ethnicity_population_counts = (
            p
            | "Load state_race_ethnicity_population_counts"
            >> ImportTable(
                dataset_id=static_reference_dataset,
                table_id="state_race_ethnicity_population_counts",
                state_code_filter=state_code,
                person_id_filter_set=None,
            )
        )

        # Identify ReleaseEvents from the StatePerson's StateIncarcerationPeriods
        person_release_events = person_entities | "ClassifyReleaseEvents" >> beam.ParDo(
            ClassifyReleaseEvents()
        )

        person_metadata = (
            persons
            | "Build the person_metadata dictionary"
            >> beam.ParDo(
                BuildPersonMetadata(), AsList(state_race_ethnicity_population_counts)
            )
        )

        person_release_events_with_metadata = (
            {"person_events": person_release_events, "person_metadata": person_metadata}
            | "Group ReleaseEvents with person-level metadata" >> beam.CoGroupByKey()
            | "Organize StatePerson, PersonMetadata and ReleaseEvents for calculations"
            >> beam.ParDo(ExtractPersonReleaseEventsMetadata())
        )

        # Get pipeline job details for accessing job_id
        all_pipeline_options = apache_beam_pipeline_options.get_all_options()

        # Add timestamp for local jobs
        job_timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H_%M_%S.%f")
        all_pipeline_options["job_timestamp"] = job_timestamp

        # Get the type of metric to calculate
        metric_types_set = set(metric_types)

        # Get recidivism metrics
        recidivism_metrics = (
            person_release_events_with_metadata
            | "Get Recidivism Metrics"
            >> GetRecidivismMetrics(
                pipeline_options=all_pipeline_options, metric_types=metric_types_set
            )
        )

        if person_id_filter_set:
            logging.warning(
                "Non-empty person filter set - returning before writing metrics."
            )
            return

        # Convert the metrics into a format that's writable to BQ
        writable_metrics = (
            recidivism_metrics
            | "Convert to dict to be written to BQ"
            >> beam.ParDo(RecidivizMetricWritableDict()).with_outputs(
                ReincarcerationRecidivismMetricType.REINCARCERATION_RATE.value,
                ReincarcerationRecidivismMetricType.REINCARCERATION_COUNT.value,
            )
        )

        # Write the recidivism metrics to the output tables in BigQuery
        rates_table_id = DATAFLOW_METRICS_TO_TABLES[ReincarcerationRecidivismRateMetric]
        counts_table_id = DATAFLOW_METRICS_TO_TABLES[
            ReincarcerationRecidivismCountMetric
        ]

        _ = (
            writable_metrics.REINCARCERATION_RATE
            | f"Write rate metrics to BQ table: {rates_table_id}"
            >> WriteAppendToBigQuery(
                output_table=rates_table_id,
                output_dataset=output,
            )
        )

        _ = (
            writable_metrics.REINCARCERATION_COUNT
            | f"Write count metrics to BQ table: {counts_table_id}"
            >> WriteAppendToBigQuery(
                output_table=counts_table_id,
                output_dataset=output,
            )
        )
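# A minimal, hypothetical sketch of the "hydrate after CoGroupByKey" pattern the
# pipeline above relies on (e.g. SetViolationOnViolationsResponse): each grouped
# element arrives as (person_id, {'violations': [...], 'violation_responses': [...]}),
# and a DoFn re-emits the parent entities with their grouped children attached.
# The entity names and the attachment step here are illustrative, not the actual
# Recidiviz implementation.
import apache_beam as beam


class _AttachChildrenSketch(beam.DoFn):
    """Attaches the grouped child entities to each parent under the same key."""

    def process(self, element):
        person_id, grouped = element
        children = list(grouped['violations'])
        for parent in grouped['violation_responses']:
            # The real DoFn would match each parent to its own children; this
            # sketch simply carries the full child list alongside the parent.
            yield person_id, (parent, children)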
Example #17
    def execute_pipeline(
        self,
        pipeline: beam.Pipeline,
        all_pipeline_options: Dict[str, Any],
        state_code: str,
        input_dataset: str,
        reference_dataset: str,
        static_reference_dataset: str,
        metric_types: List[str],
        person_id_filter_set: Optional[Set[int]],
        calculation_month_count: int = -1,
        calculation_end_month: Optional[str] = None,
    ) -> beam.Pipeline:
        persons = pipeline | "Load Persons" >> BuildRootEntity(
            dataset=input_dataset,
            root_entity_class=entities.StatePerson,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=True,
            unifying_id_field_filter_set=person_id_filter_set,
            state_code=state_code,
        )

        # Get StateIncarcerationPeriods
        incarceration_periods = (
            pipeline
            | "Load IncarcerationPeriods" >> BuildRootEntity(
                dataset=input_dataset,
                root_entity_class=entities.StateIncarcerationPeriod,
                unifying_id_field=entities.StatePerson.get_class_id_name(),
                build_related_entities=False,
                unifying_id_field_filter_set=person_id_filter_set,
                state_code=state_code,
            ))

        # Get StateSupervisionPeriods
        supervision_periods = pipeline | "Load SupervisionPeriods" >> BuildRootEntity(
            dataset=input_dataset,
            root_entity_class=entities.StateSupervisionPeriod,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=False,
            unifying_id_field_filter_set=person_id_filter_set,
            state_code=state_code,
        )

        # Bring in the table that associates people and their county of residence
        person_id_to_county_kv = (
            pipeline
            | "Load person_id_to_county_kv" >> ImportTableAsKVTuples(
                dataset_id=reference_dataset,
                table_id=PERSONS_TO_RECENT_COUNTY_OF_RESIDENCE_VIEW_NAME,
                table_key="person_id",
                state_code_filter=state_code,
                person_id_filter_set=person_id_filter_set,
            ))

        # Group each StatePerson with their StateIncarcerationPeriods
        person_entities = {
            "person": persons,
            "incarceration_periods": incarceration_periods,
            "supervision_periods": supervision_periods,
            "persons_to_recent_county_of_residence": person_id_to_county_kv,
        } | "Group StatePerson to StateIncarcerationPeriods" >> beam.CoGroupByKey(
        )

        state_race_ethnicity_population_counts = (
            pipeline
            | "Load state_race_ethnicity_population_counts" >> ImportTable(
                dataset_id=static_reference_dataset,
                table_id="state_race_ethnicity_population_counts",
                state_code_filter=state_code,
                person_id_filter_set=None,
            ))

        # Identify ReleaseEvents from the StatePerson's StateIncarcerationPeriods
        person_release_events = person_entities | "ClassifyReleaseEvents" >> beam.ParDo(
            ClassifyEvents(), identifier=self.pipeline_config.identifier)

        person_metadata = (
            persons
            | "Build the person_metadata dictionary" >> beam.ParDo(
                BuildPersonMetadata(),
                state_race_ethnicity_population_counts=AsList(
                    state_race_ethnicity_population_counts),
            ))

        person_release_events_with_metadata = (
            {
                "person_events": person_release_events,
                "person_metadata": person_metadata
            }
            | "Group ReleaseEvents with person-level metadata" >>
            beam.CoGroupByKey()
            |
            "Organize StatePerson, PersonMetadata and ReleaseEvents for calculations"
            >> beam.ParDo(ExtractPersonReleaseEventsMetadata()))

        # Add timestamp for local jobs
        job_timestamp = datetime.datetime.now().strftime(
            "%Y-%m-%d_%H_%M_%S.%f")
        all_pipeline_options["job_timestamp"] = job_timestamp

        # Get the type of metric to calculate
        metric_types_set = set(metric_types)

        # Get recidivism metrics
        recidivism_metrics = (
            person_release_events_with_metadata
            | "Get Recidivism Metrics" >> GetMetrics(
                pipeline_options=all_pipeline_options,
                pipeline_config=self.pipeline_config,
                metric_types_to_include=metric_types_set,
                calculation_end_month=calculation_end_month,
                calculation_month_count=calculation_month_count,
            ))
        return recidivism_metrics
Example #18
def run(argv=None):
    """The main function which creates the pipeline and runs it."""
    parser = argparse.ArgumentParser()
    # Here we add some specific command line arguments we expect.
    # This defaults the output table in your BigQuery; you'll have to create
    # the destination dataset yourself beforehand (e.g. using `bq mk`).
    parser.add_argument('--output', dest='output', required=False,
                        help='Output BQ table to write results to.',
                        default='lake.orders_denormalized_cogroupbykey')

    # Parse arguments from the command line.
    known_args, pipeline_args = parser.parse_known_args(argv)

    # DataLakeToDataMartCGBK is a class we built in this script to hold the logic for
    # transforming the file into a BigQuery table.  It also contains an example of
    # using CoGroupByKey
    data_lake_to_data_mart = DataLakeToDataMartCGBK()

    schema = parse_table_schema_from_json(data_lake_to_data_mart.schema_str)
    pipeline = beam.Pipeline(options=PipelineOptions(pipeline_args))

    # This query returns details about the account, normalized into a
    # different table.  We will be joining the data into the main orders dataset in order
    # to create a denormalized table.
    account_details_source = (
        pipeline
        | 'Read Account Details from BigQuery ' >> beam.io.Read(
            beam.io.BigQuerySource(query="""
                SELECT
                  acct_number,
                  acct_company_name,
                  acct_group_name,
                  acct_name,
                  acct_org_name,
                  address,
                  city,
                  state,
                  zip_code,
                  country
                FROM
                  `python-dataflow-example.example_data.account`
            """, use_standard_sql=True))
        # This next stage of the pipeline maps the acct_number to a single row of
        # results from BigQuery.  Mapping this way helps Dataflow move your data around
        # to different workers.  When later stages of the pipeline run, all results from
        # a given account number will run on one worker.
        | 'Map Account to Account Details' >> beam.Map(
            lambda row: (
                row['acct_number'], row
            )))

    orders_query = data_lake_to_data_mart.get_orders_query()
    # Read the orders from BigQuery.  This is the source of the pipeline.  All further
    # processing starts with rows read from the query results here.
    orders = (
        pipeline
        | 'Read Orders from BigQuery ' >> beam.io.Read(
            beam.io.BigQuerySource(query=orders_query, use_standard_sql=True))
        |
        # This next stage of the pipeline maps the acct_number to a single row of
        # results from BigQuery.  Mapping this way helps Dataflow move your data around
        # to different workers.  When later stages of the pipeline run, all results from
        # a given account number will run on one worker.
        'Map Account to Order Details' >> beam.Map(
            lambda row: (
                row['acct_number'], row
            )))

    # CoGroupByKey allows us to arrange the results together by key
    # Both "orders" and "account_details" are maps of
    # acct_number -> "Row of results from BigQuery".
    # The mapping is done in the above code using beam.Map()
    result = {'orders': orders, 'account_details': account_details_source} | \
             beam.CoGroupByKey()
    # The add_account_details function is responsible for defining how to
    # join the two datasets.  It passes the results of CoGroupByKey, which
    # groups the data from the same key in each dataset together in the same
    # worker.
    joined = result | beam.FlatMap(data_lake_to_data_mart.add_account_details)
    joined | 'Write Data to BigQuery' >> beam.io.Write(
        beam.io.BigQuerySink(
            # The table name is a required argument for the BigQuery sink.
            # In this case we use the value passed in from the command line.
            known_args.output,
            # Here we use the JSON schema read in from a JSON file.
            # Specifying the schema allows the API to create the table correctly if it does not yet exist.
            schema=schema,
            # Creates the table in BigQuery if it does not yet exist.
            create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
            # Deletes all data in the BigQuery table before writing.
            write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE))

    pipeline.run().wait_until_finish()
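# A self-contained toy version of the join above, runnable on the DirectRunner.
# It shows what add_account_details receives from CoGroupByKey: each element is
# (acct_number, {'orders': [...], 'account_details': [...]}). The field names and
# the merge logic are illustrative only, not the DataLakeToDataMartCGBK schema.
import apache_beam as beam


def _denormalize(element):
    acct_number, grouped = element
    for order in grouped['orders']:
        for account in grouped['account_details']:
            merged = dict(order)
            merged.update(account)
            yield merged


def _run_join_sketch():
    with beam.Pipeline() as toy_pipeline:
        toy_orders = toy_pipeline | 'CreateOrders' >> beam.Create([
            ('A1', {'acct_number': 'A1', 'order_id': 1}),
            ('A1', {'acct_number': 'A1', 'order_id': 2}),
        ])
        toy_accounts = toy_pipeline | 'CreateAccounts' >> beam.Create([
            ('A1', {'acct_number': 'A1', 'acct_name': 'Acme'}),
        ])
        _ = ({'orders': toy_orders, 'account_details': toy_accounts}
             | 'JoinByAcctNumber' >> beam.CoGroupByKey()
             | 'Denormalize' >> beam.FlatMap(_denormalize)
             | 'Print' >> beam.Map(print))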
Example #19
def run(apache_beam_pipeline_options: PipelineOptions, data_input: str,
        reference_input: str, output: str, calculation_month_count: int,
        metric_types: List[str], state_code: Optional[str],
        calculation_end_month: Optional[str],
        person_filter_ids: Optional[List[int]]):
    """Runs the incarceration calculation pipeline."""

    # Workaround to load SQLAlchemy objects at start of pipeline. This is necessary because the BuildRootEntity
    # function tries to access attributes of relationship properties on the SQLAlchemy room_schema_class before they
    # have been loaded. However, if *any* SQLAlchemy objects have been instantiated, then the relationship properties
    # are loaded and their attributes can be successfully accessed.
    _ = schema.StatePerson()

    apache_beam_pipeline_options.view_as(SetupOptions).save_main_session = True

    # Get pipeline job details
    all_apache_beam_pipeline_options = apache_beam_pipeline_options.get_all_options(
    )

    query_dataset = all_apache_beam_pipeline_options[
        'project'] + '.' + data_input
    reference_dataset = all_apache_beam_pipeline_options[
        'project'] + '.' + reference_input

    person_id_filter_set = set(
        person_filter_ids) if person_filter_ids else None

    with beam.Pipeline(options=apache_beam_pipeline_options) as p:
        # Get StatePersons
        persons = (p | 'Load StatePersons' >> BuildRootEntity(
            dataset=query_dataset,
            root_entity_class=entities.StatePerson,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=True,
            unifying_id_field_filter_set=person_id_filter_set))

        # Get StateSentenceGroups
        sentence_groups = (p | 'Load StateSentenceGroups' >> BuildRootEntity(
            dataset=query_dataset,
            root_entity_class=entities.StateSentenceGroup,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=True,
            unifying_id_field_filter_set=person_id_filter_set,
            state_code=state_code))

        # Get StateIncarcerationSentences
        incarceration_sentences = (
            p | 'Load StateIncarcerationSentences' >> BuildRootEntity(
                dataset=query_dataset,
                root_entity_class=entities.StateIncarcerationSentence,
                unifying_id_field=entities.StatePerson.get_class_id_name(),
                build_related_entities=True,
                unifying_id_field_filter_set=person_id_filter_set,
                state_code=state_code))

        # Get StateSupervisionSentences
        supervision_sentences = (
            p | 'Load StateSupervisionSentences' >> BuildRootEntity(
                dataset=query_dataset,
                root_entity_class=entities.StateSupervisionSentence,
                unifying_id_field=entities.StatePerson.get_class_id_name(),
                build_related_entities=True,
                unifying_id_field_filter_set=person_id_filter_set,
                state_code=state_code))

        if state_code is None or state_code == 'US_MO':
            # Bring in the reference table that includes sentence status ranking information
            us_mo_sentence_status_query = select_all_by_person_query(
                reference_dataset, US_MO_SENTENCE_STATUSES_VIEW_NAME,
                state_code, person_id_filter_set)

            us_mo_sentence_statuses = (
                p |
                "Read MO sentence status table from BigQuery" >> beam.io.Read(
                    beam.io.BigQuerySource(query=us_mo_sentence_status_query,
                                           use_standard_sql=True)))
        else:
            us_mo_sentence_statuses = (
                p |
                f"Generate empty MO statuses list for non-MO state run: {state_code} "
                >> beam.Create([]))

        us_mo_sentence_status_rankings_as_kv = (
            us_mo_sentence_statuses
            | 'Convert MO sentence status ranking table to KV tuples' >>
            beam.ParDo(ConvertDictToKVTuple(), 'person_id'))

        supervision_sentences_and_statuses = (
            {
                'incarceration_sentences': incarceration_sentences,
                'supervision_sentences': supervision_sentences,
                'sentence_statuses': us_mo_sentence_status_rankings_as_kv
            }
            | 'Group sentences to the sentence statuses for that person' >>
            beam.CoGroupByKey())

        sentences_converted = (
            supervision_sentences_and_statuses
            | 'Convert to state-specific sentences' >> beam.ParDo(
                ConvertSentencesToStateSpecificType()).with_outputs(
                    'incarceration_sentences', 'supervision_sentences'))

        sentences_and_sentence_groups = (
            {
                'sentence_groups': sentence_groups,
                'incarceration_sentences':
                sentences_converted.incarceration_sentences,
                'supervision_sentences':
                sentences_converted.supervision_sentences
            }
            | 'Group sentences to sentence groups' >> beam.CoGroupByKey())

        # Set hydrated sentences on the corresponding sentence groups
        sentence_groups_with_hydrated_sentences = (
            sentences_and_sentence_groups
            | 'Set hydrated sentences on sentence groups' >> beam.ParDo(
                SetSentencesOnSentenceGroup()))

        # Bring in the table that associates people and their county of residence
        person_id_to_county_query = select_all_by_person_query(
            reference_dataset,
            PERSONS_TO_RECENT_COUNTY_OF_RESIDENCE_VIEW_NAME,
            # TODO(3602): Once we put state_code on StatePerson objects, we can update the
            # persons_to_recent_county_of_residence query to have a state_code field, allowing us to also filter the
            # output by state_code.
            state_code_filter=None,
            person_id_filter_set=person_id_filter_set)

        person_id_to_county_kv = (
            p | "Read person_id to county associations from BigQuery" >>
            beam.io.Read(
                beam.io.BigQuerySource(query=person_id_to_county_query,
                                       use_standard_sql=True))
            | "Convert person_id to county association table to KV" >>
            beam.ParDo(ConvertDictToKVTuple(), 'person_id'))

        # Bring in the judicial districts associated with incarceration_periods
        ip_to_judicial_district_query = select_all_by_person_query(
            reference_dataset,
            INCARCERATION_PERIOD_JUDICIAL_DISTRICT_ASSOCIATION_VIEW_NAME,
            state_code, person_id_filter_set)

        ip_to_judicial_district_kv = (
            p |
            "Read incarceration_period to judicial_district associations from BigQuery"
            >> beam.io.Read(
                beam.io.BigQuerySource(query=ip_to_judicial_district_query,
                                       use_standard_sql=True))
            |
            "Convert incarceration_period to judicial_district association table to KV"
            >> beam.ParDo(ConvertDictToKVTuple(), 'person_id'))

        # Group each StatePerson with their related entities
        person_entities = (
            {
                'person':
                persons,
                'sentence_groups':
                sentence_groups_with_hydrated_sentences,
                'incarceration_period_judicial_district_association':
                ip_to_judicial_district_kv
            }
            | 'Group StatePerson to SentenceGroups' >> beam.CoGroupByKey())

        # Identify IncarcerationEvents from the StatePerson's StateIncarcerationPeriods
        person_events = (
            person_entities | 'Classify Incarceration Events' >> beam.ParDo(
                ClassifyIncarcerationEvents(), AsDict(person_id_to_county_kv)))

        # Get pipeline job details for accessing job_id
        all_pipeline_options = apache_beam_pipeline_options.get_all_options()

        # Add timestamp for local jobs
        job_timestamp = datetime.datetime.now().strftime(
            '%Y-%m-%d_%H_%M_%S.%f')
        all_apache_beam_pipeline_options['job_timestamp'] = job_timestamp

        # Get the type of metric to calculate
        metric_types_set = set(metric_types)

        # Get IncarcerationMetrics
        incarceration_metrics = (
            person_events
            | 'Get Incarceration Metrics' >> GetIncarcerationMetrics(
                pipeline_options=all_pipeline_options,
                metric_types=metric_types_set,
                calculation_end_month=calculation_end_month,
                calculation_month_count=calculation_month_count))

        if person_id_filter_set:
            logging.warning(
                "Non-empty person filter set - returning before writing metrics."
            )
            return

        # Convert the metrics into a format that's writable to BQ
        writable_metrics = (
            incarceration_metrics | 'Convert to dict to be written to BQ' >>
            beam.ParDo(RecidivizMetricWritableDict()).with_outputs(
                IncarcerationMetricType.INCARCERATION_ADMISSION.value,
                IncarcerationMetricType.INCARCERATION_POPULATION.value,
                IncarcerationMetricType.INCARCERATION_RELEASE.value))

        # Write the metrics to the output tables in BigQuery
        admissions_table_id = DATAFLOW_METRICS_TO_TABLES.get(
            IncarcerationAdmissionMetric)
        population_table_id = DATAFLOW_METRICS_TO_TABLES.get(
            IncarcerationPopulationMetric)
        releases_table_id = DATAFLOW_METRICS_TO_TABLES.get(
            IncarcerationReleaseMetric)

        _ = (writable_metrics.INCARCERATION_ADMISSION
             | f"Write admission metrics to BQ table: {admissions_table_id}" >>
             beam.io.WriteToBigQuery(
                 table=admissions_table_id,
                 dataset=output,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
                 method=beam.io.WriteToBigQuery.Method.FILE_LOADS))

        _ = (writable_metrics.INCARCERATION_POPULATION
             | f"Write population metrics to BQ table: {population_table_id}"
             >> beam.io.WriteToBigQuery(
                 table=population_table_id,
                 dataset=output,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
                 method=beam.io.WriteToBigQuery.Method.FILE_LOADS))

        _ = (writable_metrics.INCARCERATION_RELEASE
             | f"Write release metrics to BQ table: {releases_table_id}" >>
             beam.io.WriteToBigQuery(
                 table=releases_table_id,
                 dataset=output,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
                 method=beam.io.WriteToBigQuery.Method.FILE_LOADS))
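# A minimal sketch (made-up names, unrelated to the Recidiviz metrics) of the
# tagged-output pattern used above: beam.ParDo(...).with_outputs(...) returns a
# DoOutputsTuple whose attributes, e.g. writable_metrics.INCARCERATION_ADMISSION,
# are the per-tag PCollections emitted via pvalue.TaggedOutput.
import apache_beam as beam
from apache_beam import pvalue


class _RouteByParity(beam.DoFn):
    """Emits each number to either the 'even' or the 'odd' tagged output."""

    def process(self, number):
        tag = 'even' if number % 2 == 0 else 'odd'
        yield pvalue.TaggedOutput(tag, number)


def _run_tagged_output_sketch():
    with beam.Pipeline() as tag_pipeline:
        routed = (tag_pipeline
                  | 'CreateNumbers' >> beam.Create([1, 2, 3, 4])
                  | 'RouteByParity' >> beam.ParDo(
                      _RouteByParity()).with_outputs('even', 'odd'))
        _ = routed.even | 'PrintEven' >> beam.Map(print)
        _ = routed.odd | 'PrintOdd' >> beam.Map(print)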
Example #20
def run(argv=None):
    """Runs the supervision calculation pipeline."""

    # Workaround to load SQLAlchemy objects at start of pipeline. This is necessary because the BuildRootEntity
    # function tries to access attributes of relationship properties on the SQLAlchemy room_schema_class before they
    # have been loaded. However, if *any* SQLAlchemy objects have been instantiated, then the relationship properties
    # are loaded and their attributes can be successfully accessed.
    _ = schema.StatePerson()

    # Parse command-line arguments
    known_args, pipeline_args = parse_arguments(argv)

    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True

    # Get pipeline job details
    all_pipeline_options = pipeline_options.get_all_options()

    input_dataset = all_pipeline_options['project'] + '.' + known_args.input
    reference_dataset = all_pipeline_options['project'] + '.' + \
        known_args.reference_input

    with beam.Pipeline(argv=pipeline_args) as p:
        # Get StatePersons
        persons = (p | 'Load Persons' >> BuildRootEntity(
            dataset=input_dataset,
            data_dict=None,
            root_schema_class=schema.StatePerson,
            root_entity_class=entities.StatePerson,
            unifying_id_field='person_id',
            build_related_entities=True))

        # Get StateIncarcerationPeriods
        incarceration_periods = (
            p | 'Load IncarcerationPeriods' >> BuildRootEntity(
                dataset=input_dataset,
                data_dict=None,
                root_schema_class=schema.StateIncarcerationPeriod,
                root_entity_class=entities.StateIncarcerationPeriod,
                unifying_id_field='person_id',
                build_related_entities=True))

        # Get StateSupervisionViolations
        supervision_violations = (
            p | 'Load SupervisionViolations' >> BuildRootEntity(
                dataset=input_dataset,
                data_dict=None,
                root_schema_class=schema.StateSupervisionViolation,
                root_entity_class=entities.StateSupervisionViolation,
                unifying_id_field='person_id',
                build_related_entities=True))

        # TODO(2769): Don't bring this in as a root entity
        # Get StateSupervisionViolationResponses
        supervision_violation_responses = (
            p | 'Load SupervisionViolationResponses' >> BuildRootEntity(
                dataset=input_dataset,
                data_dict=None,
                root_schema_class=schema.StateSupervisionViolationResponse,
                root_entity_class=entities.StateSupervisionViolationResponse,
                unifying_id_field='person_id',
                build_related_entities=True))

        # Get StateSupervisionSentences
        supervision_sentences = (
            p | 'Load SupervisionSentences' >> BuildRootEntity(
                dataset=input_dataset,
                data_dict=None,
                root_schema_class=schema.StateSupervisionSentence,
                root_entity_class=entities.StateSupervisionSentence,
                unifying_id_field='person_id',
                build_related_entities=True))

        # Get StateIncarcerationSentences
        incarceration_sentences = (
            p | 'Load IncarcerationSentences' >> BuildRootEntity(
                dataset=input_dataset,
                data_dict=None,
                root_schema_class=schema.StateIncarcerationSentence,
                root_entity_class=entities.StateIncarcerationSentence,
                unifying_id_field='person_id',
                build_related_entities=True))

        # Get StateSupervisionPeriods
        supervision_periods = (
            p | 'Load SupervisionPeriods' >> BuildRootEntity(
                dataset=input_dataset,
                data_dict=None,
                root_schema_class=schema.StateSupervisionPeriod,
                root_entity_class=entities.StateSupervisionPeriod,
                unifying_id_field='person_id',
                build_related_entities=True))

        # Get StateAssessments
        assessments = (p | 'Load Assessments' >> BuildRootEntity(
            dataset=input_dataset,
            data_dict=None,
            root_schema_class=schema.StateAssessment,
            root_entity_class=entities.StateAssessment,
            unifying_id_field='person_id',
            build_related_entities=False))

        # Bring in the table that associates StateSupervisionViolationResponses to information about StateAgents
        ssvr_to_agent_association_query = f"SELECT * FROM `{reference_dataset}.ssvr_to_agent_association`"

        ssvr_to_agent_associations = (
            p | "Read SSVR to Agent table from BigQuery" >> beam.io.Read(
                beam.io.BigQuerySource(query=ssvr_to_agent_association_query,
                                       use_standard_sql=True)))

        # Convert the association table rows into key-value tuples with the value for the
        # supervision_violation_response_id column as the key
        ssvr_agent_associations_as_kv = (
            ssvr_to_agent_associations
            | 'Convert SSVR to Agent table to KV tuples' >> beam.ParDo(
                ConvertDictToKVTuple(), 'supervision_violation_response_id'))

        supervision_period_to_agent_association_query = f"SELECT * FROM `{reference_dataset}." \
                                                        f"supervision_period_to_agent_association`"

        supervision_period_to_agent_associations = (
            p | "Read Supervision Period to Agent table from BigQuery" >>
            beam.io.Read(
                beam.io.BigQuerySource(
                    query=supervision_period_to_agent_association_query,
                    use_standard_sql=True)))

        # Convert the association table rows into key-value tuples with the value for the supervision_period_id column
        # as the key
        supervision_period_to_agent_associations_as_kv = (
            supervision_period_to_agent_associations
            | 'Convert Supervision Period to Agent table to KV tuples' >>
            beam.ParDo(ConvertDictToKVTuple(), 'supervision_period_id'))

        # Group StateSupervisionViolationResponses and StateSupervisionViolations by person_id
        supervision_violations_and_responses = (
            {
                'violations': supervision_violations,
                'violation_responses': supervision_violation_responses
            } | 'Group StateSupervisionViolationResponses to '
            'StateSupervisionViolations' >> beam.CoGroupByKey())

        # Set the fully hydrated StateSupervisionViolation entities on the corresponding
        # StateSupervisionViolationResponses
        violation_responses_with_hydrated_violations = (
            supervision_violations_and_responses
            | 'Set hydrated StateSupervisionViolations on '
            'the StateSupervisionViolationResponses' >> beam.ParDo(
                SetViolationOnViolationsResponse()))

        # Group StateIncarcerationPeriods and StateSupervisionViolationResponses by person_id
        incarceration_periods_and_violation_responses = (
            {
                'incarceration_periods': incarceration_periods,
                'violation_responses':
                violation_responses_with_hydrated_violations
            }
            | 'Group StateIncarcerationPeriods to '
            'StateSupervisionViolationResponses' >> beam.CoGroupByKey())

        # Set the fully hydrated StateSupervisionViolationResponse entities on the corresponding
        # StateIncarcerationPeriods
        incarceration_periods_with_source_violations = (
            incarceration_periods_and_violation_responses
            | 'Set hydrated StateSupervisionViolationResponses on '
            'the StateIncarcerationPeriods' >> beam.ParDo(
                SetViolationResponseOnIncarcerationPeriod()))

        # Group each StatePerson with their StateIncarcerationPeriods and StateSupervisionSentences
        person_periods_and_sentences = (
            {
                'person': persons,
                'assessments': assessments,
                'incarceration_periods':
                incarceration_periods_with_source_violations,
                'supervision_periods': supervision_periods,
                'supervision_sentences': supervision_sentences,
                'incarceration_sentences': incarceration_sentences,
                'violation_responses':
                violation_responses_with_hydrated_violations
            }
            | 'Group StatePerson to all entities' >> beam.CoGroupByKey())

        # The state_code to run calculations on
        state_code = known_args.state_code

        identifier_options = {'state_code': state_code}

        # Identify SupervisionTimeBuckets from the StatePerson's StateSupervisionSentences and StateIncarcerationPeriods
        person_time_buckets = (
            person_periods_and_sentences
            | 'Get SupervisionTimeBuckets' >> beam.ParDo(
                ClassifySupervisionTimeBuckets(),
                AsDict(ssvr_agent_associations_as_kv),
                AsDict(supervision_period_to_agent_associations_as_kv), **
                identifier_options))

        # Get dimensions to include and methodologies to use
        inclusions, _ = dimensions_and_methodologies(known_args)

        # Get pipeline job details for accessing job_id
        all_pipeline_options = pipeline_options.get_all_options()

        # Get the type of metric to calculate
        metric_type = known_args.metric_type

        # The number of months to limit the monthly calculation output to
        calculation_month_limit = known_args.calculation_month_limit

        # Add timestamp for local jobs
        job_timestamp = datetime.datetime.now().strftime(
            '%Y-%m-%d_%H_%M_%S.%f')
        all_pipeline_options['job_timestamp'] = job_timestamp

        # Get supervision metrics
        supervision_metrics = (
            person_time_buckets
            | 'Get Supervision Metrics' >> GetSupervisionMetrics(
                pipeline_options=all_pipeline_options,
                inclusions=inclusions,
                metric_type=metric_type,
                calculation_month_limit=calculation_month_limit))

        # Convert the metrics into a format that's writable to BQ
        writable_metrics = (
            supervision_metrics | 'Convert to dict to be written to BQ' >>
            beam.ParDo(SupervisionMetricWritableDict()).with_outputs(
                'populations', 'revocations', 'successes',
                'assessment_changes', 'revocation_analyses',
                'revocation_violation_type_analyses'))

        # Write the metrics to the output tables in BigQuery
        populations_table = known_args.output + '.supervision_population_metrics'

        revocations_table = known_args.output + '.supervision_revocation_metrics'

        successes_table = known_args.output + '.supervision_success_metrics'

        assessment_changes_table = known_args.output + '.terminated_supervision_assessment_score_change_metrics'

        revocation_analysis_table = known_args.output + '.supervision_revocation_analysis_metrics'

        revocation_violation_type_analysis_table = known_args.output + \
            '.supervision_revocation_violation_type_analysis_metrics'

        _ = (writable_metrics.populations
             | f"Write population metrics to BQ table: {populations_table}" >>
             beam.io.WriteToBigQuery(
                 table=populations_table,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))

        _ = (writable_metrics.revocations
             | f"Write revocation metrics to BQ table: {revocations_table}" >>
             beam.io.WriteToBigQuery(
                 table=revocations_table,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))

        _ = (writable_metrics.successes
             | f"Write success metrics to BQ table: {successes_table}" >>
             beam.io.WriteToBigQuery(
                 table=successes_table,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))

        _ = (
            writable_metrics.assessment_changes
            |
            f"Write assessment change metrics to BQ table: {assessment_changes_table}"
            >> beam.io.WriteToBigQuery(
                table=assessment_changes_table,
                create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))

        _ = (
            writable_metrics.revocation_analyses
            |
            f"Write revocation analyses metrics to BQ table: {revocation_analysis_table}"
            >> beam.io.WriteToBigQuery(
                table=revocation_analysis_table,
                create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))

        _ = (writable_metrics.revocation_violation_type_analyses
             |
             f"Write revocation violation type analyses metrics to BQ table: "
             f"{revocation_violation_type_analysis_table}" >>
             beam.io.WriteToBigQuery(
                 table=revocation_violation_type_analysis_table,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))
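# A minimal sketch (toy names) of the side-input pattern used above: a keyed
# PCollection wrapped in AsDict is handed to beam.ParDo, and the DoFn can look
# values up by key while processing the main input. The agent/period names are
# illustrative, not the real association tables.
import apache_beam as beam
from apache_beam.pvalue import AsDict


class _LookupAgentSketch(beam.DoFn):
    """Annotates each element with a value looked up in the side-input dict."""

    def process(self, element, agent_by_period_id):
        period_id, record = element
        yield period_id, (record, agent_by_period_id.get(period_id))


def _run_side_input_sketch():
    with beam.Pipeline() as side_input_pipeline:
        main_input = side_input_pipeline | 'CreatePeriods' >> beam.Create(
            [(1, 'period-1'), (2, 'period-2')])
        agents = side_input_pipeline | 'CreateAgents' >> beam.Create(
            [(1, 'agent-A')])
        _ = (main_input
             | 'AnnotateWithAgent' >> beam.ParDo(
                 _LookupAgentSketch(), AsDict(agents))
             | 'Print' >> beam.Map(print))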
Example #21
def run(argv=None, assert_results=None):

  parser = argparse.ArgumentParser()
  parser.add_argument(
      '--input_email',
      required=True,
      help='Email database, with each line formatted as "name<TAB>email".')
  parser.add_argument(
      '--input_phone',
      required=True,
      help='Phonebook, with each line formatted as "name<TAB>phone number".')
  parser.add_argument(
      '--input_snailmail',
      required=True,
      help='Address database, with each line formatted as "name<TAB>address".')
  parser.add_argument('--output_tsv',
                      required=True,
                      help='Tab-delimited output file.')
  parser.add_argument('--output_stats',
                      required=True,
                      help='Output file for statistics about the input.')
  known_args, pipeline_args = parser.parse_known_args(argv)
  # We use the save_main_session option because one or more DoFn's in this
  # workflow rely on global context (e.g., a module imported at module level).
  pipeline_options = PipelineOptions(pipeline_args)
  pipeline_options.view_as(SetupOptions).save_main_session = True
  p = beam.Pipeline(options=pipeline_options)

  # Helper: read a tab-separated key-value mapping from a text file, escape all
  # quotes/backslashes, and convert it to a PCollection of (key, value) pairs.
  def read_kv_textfile(label, textfile):
    return (p
            | 'Read: %s' % label >> ReadFromText(textfile)
            | 'Backslash: %s' % label >> beam.Map(
                lambda x: re.sub(r'\\', r'\\\\', x))
            | 'EscapeQuotes: %s' % label >> beam.Map(
                lambda x: re.sub(r'"', r'\"', x))
            | 'Split: %s' % label >> beam.Map(
                lambda x: re.split(r'\t+', x, 1)))

  # Read input databases.
  email = read_kv_textfile('email', known_args.input_email)
  phone = read_kv_textfile('phone', known_args.input_phone)
  snailmail = read_kv_textfile('snailmail', known_args.input_snailmail)

  # Group together all entries under the same name.
  grouped = (email, phone, snailmail) | 'group_by_name' >> beam.CoGroupByKey()

  # Prepare tab-delimited output; something like this:
  # "name"<TAB>"email_1,email_2"<TAB>"phone"<TAB>"first_snailmail_only"
  # Each grouped element is (name, (emails, phones, addresses)); lambda tuple
  # unpacking was removed in Python 3, so the slots are accessed by index.
  tsv_lines = grouped | beam.Map(
      lambda kv: '\t'.join(
          ['"%s"' % kv[0],
           '"%s"' % ','.join(kv[1][0]),
           '"%s"' % ','.join(kv[1][1]),
           '"%s"' % next(iter(kv[1][2]), '')]))

  # Compute some stats about our database of people.
  luddites = grouped | beam.Filter(  # People without email.
      lambda kv: not next(iter(kv[1][0]), None))
  writers = grouped | beam.Filter(   # People without phones.
      lambda kv: not next(iter(kv[1][1]), None))
  nomads = grouped | beam.Filter(    # People without addresses.
      lambda kv: not next(iter(kv[1][2]), None))

  num_luddites = luddites | 'Luddites' >> beam.combiners.Count.Globally()
  num_writers = writers | 'Writers' >> beam.combiners.Count.Globally()
  num_nomads = nomads | 'Nomads' >> beam.combiners.Count.Globally()

  # Write tab-delimited output.
  # pylint: disable=expression-not-assigned
  tsv_lines | 'WriteTsv' >> WriteToText(known_args.output_tsv)

  # TODO(silviuc): Move the assert_results logic to the unit test.
  if assert_results is not None:
    # assert_that/equal_to live in apache_beam.testing.util in current Beam.
    from apache_beam.testing.util import assert_that, equal_to
    expected_luddites, expected_writers, expected_nomads = assert_results
    assert_that(num_luddites, equal_to([expected_luddites]),
                label='assert:luddites')
    assert_that(num_writers, equal_to([expected_writers]),
                label='assert:writers')
    assert_that(num_nomads, equal_to([expected_nomads]),
                label='assert:nomads')
  # Execute pipeline.
  return p.run()
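# A small runnable illustration (toy data, DirectRunner) of the element shape
# produced when CoGroupByKey is applied to a tuple of keyed PCollections, as in
# 'group_by_name' above: each element is (name, (emails, phones, addresses)),
# one iterable per input, in the order the inputs were given.
import apache_beam as beam


def _run_shape_sketch():
  with beam.Pipeline() as shape_pipeline:
    email = shape_pipeline | 'email' >> beam.Create([('ann', 'ann@example.org')])
    phone = shape_pipeline | 'phone' >> beam.Create([('bob', '555-0101')])
    snailmail = shape_pipeline | 'mail' >> beam.Create([('ann', '1 Main St')])
    _ = ((email, phone, snailmail)
         | 'group_by_name' >> beam.CoGroupByKey()
         | 'show_shape' >> beam.Map(
             lambda kv: print(
                 kv[0], list(kv[1][0]), list(kv[1][1]), list(kv[1][2]))))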
Example #22
    def _get_page_content(self, pipeline, file_paths, dl_manager):
        """Build PCollection of un-split page content."""

        wet_file_paths = pipeline | "create_wet_files" >> beam.Create(
            file_paths["wet_files"])
        if "wet_urls" in file_paths:

            def download_url(url, downloader, pipeline):
                path = downloader.download(url)
                if not pipeline.is_local():
                    path = downloader.ship_files_with_pipeline(path, pipeline)
                return path

            dl_wet_file_paths = (
                pipeline
                | "create_wet_urls" >> beam.Create(file_paths["wet_urls"])
                | beam.Map(
                    download_url, downloader=dl_manager, pipeline=pipeline))
            wet_file_paths = (wet_file_paths,
                              dl_wet_file_paths) | beam.Flatten()

        # Parse WET files and filter by length.
        # Output: url, text
        page_content = wet_file_paths | beam.FlatMap(
            split_wet_file) | beam.Filter(is_valid_length)

        # Optionally filter for RealNews domains.
        # Output: url, text
        if self.config.realnewslike:
            with open(file_paths["realnews_domains"], "r") as f:
                realnews_domains = json.load(f)
            page_content = page_content | beam.Filter(is_realnews_domain,
                                                      realnews_domains)

        # Normalize and deduplicate by URL.
        # Output: url, text
        page_content = (page_content
                        | "normalize_url" >> beam.Map(normalize_url)
                        | "group_url" >> beam.GroupByKey()
                        | beam.Map(dedupe_urls))

        # Optionally filter for WebText-like URLs.
        # Output: url, text
        if self.config.webtextlike:
            webtextlike_urls = (
                pipeline
                | "read_webtextlike_urls" >> beam.io.ReadFromText(
                    os.path.join(file_paths["openwebtext_urls_zip"],
                                 _OPENWEBTEXT_URLS_FILE_PATTERN))
                | "add_dummy_page" >> beam.Map(lambda x: (x, ""))
                | "normal_webtext_url" >> beam.Map(normalize_url))
            page_content = ({
                "text": page_content,
                "webtextlike_urls": webtextlike_urls
            }
                            | "group_webtextlike_urls" >> beam.CoGroupByKey()
                            | beam.FlatMap(filter_by_webtextlike))

        # Optionally clean pages of badwords, boilerplate text, and duplicate
        # spans of sentences.
        # Output: url, text
        if self.config.clean:
            with open(file_paths["badwords"], "r") as f:
                badwords = [l.strip() for l in f]
            page_content = page_content | "clean_pages" >> beam.FlatMap(
                get_clean_page_fn(badwords))
            page_content = remove_duplicate_text(page_content)

        # Optionally filter out non-`language` pages. We do this after cleaning
        # since it may change the predominant language.
        if self.config.lang != "all":
            page_content |= beam.Filter(is_language, language=self.config.lang)

        return page_content
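# A hedged sketch of the idiom used for webtextlike_urls above: keep a keyed page
# only if the same normalized URL also appears in a second PCollection.
# filter_by_webtextlike presumably does something close to this; the names and
# toy data here are illustrative only.
import apache_beam as beam


def _keep_if_listed(element):
    url, grouped = element
    if grouped['webtextlike_urls']:
        for text in grouped['text']:
            yield url, text


def _run_url_filter_sketch():
    with beam.Pipeline() as url_pipeline:
        pages = url_pipeline | 'pages' >> beam.Create([
            ('example.com/a', 'page a text'),
            ('example.com/b', 'page b text'),
        ])
        allowlist = url_pipeline | 'allowlist' >> beam.Create([
            ('example.com/a', ''),
        ])
        _ = ({'text': pages, 'webtextlike_urls': allowlist}
             | 'group_urls' >> beam.CoGroupByKey()
             | 'keep_listed' >> beam.FlatMap(_keep_if_listed)
             | 'print_kept' >> beam.Map(print))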
Example #23
def FilterOutSlices(  # pylint: disable=invalid-name
        values: beam.pvalue.PCollection,
        slices_count: beam.pvalue.PCollection,
        min_slice_size: int,
        error_metric_key: Text = '__ERROR__') -> beam.pvalue.PCollection:
    """Filter out slices with examples count lower than k_anonymization_count.

  Since we might filter out certain slices to preserve privacy in the case of
  small slices, to make end users aware of this, we will append filtered out
  slice keys with empty data, and a debug message explaining the omission.

  Args:
    values: PCollection of aggregated data keyed at slice_key
    slices_count: PCollection of slice keys and their example count.
    min_slice_size: If the number of examples in a specific slice is less than
      min_slice_size, then an error will be returned for that slice. This will
      be useful to ensure privacy by not displaying the aggregated data for
      smaller number of examples.
    error_metric_key: The special metric key to indicate errors.

  Returns:
    A PCollection keyed at all the possible slice_key and aggregated data for
    slice keys with example count more than min_slice_size and error
    message for filtered out slices.
  """
    class FilterOutSmallSlicesDoFn(beam.DoFn):
        """DoFn to filter out small slices."""
        def __init__(self, error_metric_key: Text):
            self.error_metric_key = error_metric_key

        def process(
            self, element: Tuple[SliceKeyType, _MetricsDict]
        ) -> Generator[Tuple[SliceKeyType, _MetricsDict], None, None]:
            """Filter out small slices.

      For slices (excluding the overall slice) with an example count lower
      than min_slice_size, it adds an error message.

      Args:
        element: Tuple containing slice key and a dictionary containing
          corresponding elements from merged pcollections.

      Yields:
        PCollection of (slice_key, aggregated_data or error message)
      """
            (slice_key, value) = element
            if value['values']:
                if (not slice_key
                        or value['slices_count'][0] >= min_slice_size):
                    yield (slice_key, value['values'][0])
                else:
                    yield (
                        slice_key,
                        {
                            self.error_metric_key:  # LINT.IfChange
                                'Example count for this slice key is lower than the '
                                'minimum required value: %d. No data is aggregated for '
                                'this slice.' % min_slice_size
                            # LINT.ThenChange(../addons/fairness/frontend/fairness-metrics-board/fairness-metrics-board.js)
                        })

    return ({
        'values': values,
        'slices_count': slices_count
    }
            | 'CoGroupingSlicesCountAndAggregatedData' >> beam.CoGroupByKey()
            | 'FilterOutSmallSlices' >> beam.ParDo(
                FilterOutSmallSlicesDoFn(error_metric_key)))
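# A hypothetical usage sketch of FilterOutSlices, assuming the function above
# (and the TFMA type aliases its annotations reference) is importable. Slice keys
# are tuples of (column, value) pairs; the empty tuple is the overall slice,
# which is never filtered out.
import apache_beam as beam


def _run_filter_out_slices_sketch():
    with beam.Pipeline() as slice_pipeline:
        values = slice_pipeline | 'values' >> beam.Create([
            ((), {'accuracy': 0.9}),
            ((('country', 'XX'),), {'accuracy': 0.5}),
        ])
        slices_count = slice_pipeline | 'counts' >> beam.Create([
            ((), 100),
            ((('country', 'XX'),), 3),
        ])
        # The 'country=XX' slice has only 3 examples, so it is replaced by an
        # error message keyed under '__ERROR__'; the overall slice passes through.
        _ = (FilterOutSlices(values, slices_count, min_slice_size=10)
             | 'print_slices' >> beam.Map(print))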
Example #24
  def pipeline(root):
    """Beam pipeline for generating light curve periodograms."""
    # Initialize DoFns.
    read_light_curve = light_curve_fns.ReadLightCurveDoFn(
        FLAGS.kepler_data_dir, injected_group=FLAGS.injected_group)

    get_top_result = bls_fns.GetTopResultDoFn("median_flattened")

    count_transits = light_curve_fns.CountTransitsDoFn(
        FLAGS.complete_transit_fraction)

    process_light_curve = light_curve_fns.ProcessLightCurveDoFn(
        gap_width=0.75,
        normalize_method="spline",
        normalize_args={
            "bkspace_min": 0.5,
            "bkspace_max": 20,
            "bkspace_num": 20,
            "penalty_coeff": 1.0,
        },
        upward_outlier_sigma_cut=FLAGS.upward_outlier_sigma_cut,
        output_name="light_curve_for_predictions")

    make_predictions = prediction_fns.MakePredictionsDoFn(
        FLAGS.astronet_model, FLAGS.astronet_config_name,
        FLAGS.astronet_config_json, FLAGS.astronet_model_dir)

    to_csv = prediction_fns.ToCsvDoFn()

    top_results = []
    for planet_num in range(FLAGS.detections_per_target):
      read_stage_name = "read_top_results-%d" % planet_num
      prepare_inputs_stage_name = "prepare_inputs-%d" % planet_num
      top_results.append(
          root
          | read_stage_name >> beam.io.tfrecordio.ReadFromTFRecord(
              os.path.join(FLAGS.input_dir, "top-results-%d*" % planet_num),
              coder=beam.coders.ProtoCoder(bls_pb2.TopResults))
          | prepare_inputs_stage_name >> beam.ParDo(PrepareInputs(planet_num)))

    # Output: PCollection({
    #    "kepler_id",
    #    "raw_light_curve",
    #    "light_curve_for_predictions",
    # })
    light_curves = (
        # TODO(shallue): replace top_results[0] with getting all keys and
        # deduping and removing the reshuffle.
        top_results[0]
        | "reshuffle_top_results" >> beam.Reshuffle()
        | "get_kepids" >> beam.Map(lambda kv: {"kepler_id": kv[0]})
        | "read_light_curves" >> beam.ParDo(read_light_curve)
        | "process_light_curves" >> beam.ParDo(process_light_curve)
        | "pair_lc_with_kepid" >> beam.Map(_pair_with_kepid))

    all_detections = top_results | "flatten_top_results" >> beam.Flatten()
    detections_and_light_curves = (
        [light_curves, all_detections]
        | "group_by_kepid" >> beam.CoGroupByKey()
        | "pair_light_curves_and_detections" >> beam.ParDo(
            PairLightCurveAndDetectionsDoFn()))

    predictions = (
        detections_and_light_curves
        | "get_top_result" >> beam.ParDo(get_top_result)
        | "count_transits" >> beam.ParDo(count_transits)
        | "make_predictions" >> beam.ParDo(make_predictions))

    # Write predictions
    (predictions | "to_csv" >> beam.ParDo(to_csv)
     | "reshuffle_csv_lines" >> beam.Reshuffle()
     | "write_csv" >> beam.io.WriteToText(
         os.path.join(FLAGS.output_dir, "predictions.csv"),
         num_shards=1,
         header=to_csv.csv_header(),
         shard_name_template=""))

    # Write local and global views.
    _write_examples(predictions)
Example #25
def make_beam_pipeline(root,
                       input_filenames,
                       sample_rate,
                       debug,
                       embedding_names,
                       embedding_modules,
                       module_output_keys,
                       audio_key,
                       sample_rate_key,
                       label_key,
                       speaker_id_key,
                       average_over_time,
                       delete_audio_from_output,
                       output_filename,
                       split_embeddings_into_separate_tables=False,
                       use_frontend_fn=False,
                       model_input_min_length=None,
                       input_format='tfrecord',
                       output_format='tfrecord',
                       suffix='Main'):
    """Construct beam pipeline for mapping from audio to embeddings.

  Args:
    root: The beam root node.
    input_filenames: Python list. List of input files.
    sample_rate: Python int, or `None`. The sample rate for all embeddings,
      or `None` if this is a TFDS dataset, or if each example has its own sample
      rate.
    debug: Python bool. Whether to operate in debug mode.
    embedding_names: Python list of embeddings.
    embedding_modules: Python list of TF-Hub modules.
    module_output_keys: Python list of strings, names of the module outputs to
      use.
    audio_key: Python string, the key of the audio.
    sample_rate_key: Python string or `None`, the key for the sample rate in
      each example.
    label_key: Python string. Field for label.
    speaker_id_key: Python string or `None`. Key for speaker ID, or `None`.
    average_over_time: Python bool. If `True`, average over the time axis.
    delete_audio_from_output: Python bool. Whether to remove audio from the
      outputs.
    output_filename: Python string. Output filename.
    split_embeddings_into_separate_tables: Python bool. If true, write each
      embedding to a separate table.
    use_frontend_fn: If `true`, call frontend fn on audio before passing to the
      model.
    model_input_min_length: Min length to the model, or `None`. 0-pad inputs to
      this length, if necessary. Note that frontends usually contain their own
      length logic, unless the model is in TFLite format.
    input_format: Python string. Must correspond to a function in
      `reader_functions`.
    output_format: Python string. Must correspond to a function in
      `writer_functions`.
    suffix: Python string. Suffix appended to stage names to make them unique.
  """
    tf_examples_key_ = 'tf_examples'
    assert tf_examples_key_ not in embedding_names
    s = suffix  # for code brevity.

    # Read from input.
    input_examples = reader_functions[input_format](root, input_filenames, s)

    # In debug mode, take one input example.
    if debug:
        input_examples = (
            input_examples
            | f'TakeOne{s}' >>
            beam.transforms.combiners.Sample.FixedSizeGlobally(1)
            # Sampling generates lists, so flatten back into one collection.
            | f'DebugFlatten{s}' >> beam.FlatMap(lambda x: x))

    # Compute all the embeddings simultaneously.
    embedding_tables = {}
    for name, mod, out_key in zip(embedding_names, embedding_modules,
                                  module_output_keys):
        logging.info('Adding signal: %s %s, %s', name, mod, out_key)
        tbl = input_examples | f'ComputeEmbedding-{name}-{s}' >> beam.ParDo(
            ComputeEmbeddingMapFn(
                name=name,
                module=mod,
                output_key=out_key,
                audio_key=audio_key,
                sample_rate_key=sample_rate_key,
                sample_rate=sample_rate,
                average_over_time=average_over_time,
                feature_fn=_default_feature_fn if use_frontend_fn else None,
                model_input_min_length=model_input_min_length))
        embedding_tables[name] = tbl
    assert tf_examples_key_ not in embedding_tables
    embedding_tables[tf_examples_key_] = input_examples
    logging.info('embedding_tables: %s', embedding_tables)

    # Either write to one table with all embeddings, or one table per embedding.
    if split_embeddings_into_separate_tables:
        output_table_dicts = [(k, {
            k: v,
            tf_examples_key_: input_examples
        }) for k, v in embedding_tables.items() if k != tf_examples_key_]
    else:
        output_table_dicts = [('all', embedding_tables)]

    # Combine embeddings and tf.train.Example, using the common key.
    writer_function = writer_functions[output_format]
    for name, cur_embedding_tables in output_table_dicts:
        if split_embeddings_into_separate_tables:
            cur_s = f'{name}-{s}'
            # Add `name` as a subdir.
            dirname, basename = os.path.split(output_filename)
            cur_output_filename = os.path.join(dirname, name, f'{basename}@*')
        else:
            cur_s = s
            cur_output_filename = f'{output_filename}@*'
        combined_tbl = (
            cur_embedding_tables
            | f'CombineEmbeddingTables-{cur_s}' >> beam.CoGroupByKey()
            | f'AddEmbeddings-{cur_s}' >> beam.Map(
                _add_embedding_column_map_fn,
                original_example_key=tf_examples_key_,
                delete_audio_from_output=delete_audio_from_output,
                audio_key=audio_key,
                label_key=label_key,
                speaker_id_key=speaker_id_key))
        logging.info('Writing to %s', cur_output_filename)
        writer_function(combined_tbl, cur_output_filename, cur_s)
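
A minimal, self-contained sketch of the CoGroupByKey pattern the block above relies on: several keyed tables are grouped on a shared key and merged into one record per key. All names and toy values below are illustrative stand-ins, not part of the pipeline above.

import apache_beam as beam


def _merge_tables(kv):
    key, tables = kv
    # CoGroupByKey yields, per key, a mapping from table name to the iterable
    # of values that shared that key.
    return key, {name: list(values) for name, values in tables.items()}


with beam.Pipeline() as p:  # DirectRunner by default
    embeddings = p | 'ToyEmbeddings' >> beam.Create([('ex1', [0.1, 0.2])])
    examples = p | 'ToyExamples' >> beam.Create([('ex1', b'raw-audio-bytes')])
    merged = (
        {'embedding': embeddings, 'tf_examples': examples}
        | 'CombineTables' >> beam.CoGroupByKey()
        | 'MergeTables' >> beam.Map(_merge_tables))
    merged | 'Print' >> beam.Map(print)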
Example #26
        beam.io.BigQuerySource(query=emp_query_str, use_standard_sql=True))

    # apply ParDo to the Job and Employer records
    job_tuple_pcoll = job_query_results | 'Transform Job Record' >> beam.ParDo(
        TransformJobRecord())
    emp_tuple_pcoll = emp_query_results | 'Transform Employer Record' >> beam.ParDo(
        TransformEmployerRecord())

    job_tuple_pcoll | 'Write to File 1' >> WriteToText(DIR_PATH +
                                                       'output_job_tuples.txt')
    emp_tuple_pcoll | 'Write to File 2' >> WriteToText(DIR_PATH +
                                                       'output_emp_tuples.txt')

    # Join Job and Employer on employer_name, employer_city
    joined_pcoll = (job_tuple_pcoll, emp_tuple_pcoll
                    ) | 'Join Job and Employer' >> beam.CoGroupByKey()
    joined_pcoll | 'Write to File 3' >> WriteToText(DIR_PATH +
                                                    'output_joined_pcoll.txt')

    job_bq_pcoll = joined_pcoll | 'Transform to BigQuery Record' >> beam.ParDo(
        MakeBigQueryRecord())
    job_bq_pcoll | 'Write to File 4' >> WriteToText(DIR_PATH +
                                                    'output_bq_record.txt')

    qualified_table_name = PROJECT_ID + ':h1b_split.Job'
    table_schema = 'job_id:STRING,employer_id:STRING,employment_start_year:INTEGER,employment_start_date:DATE,employment_end_date:DATE,job_title:STRING,'\
                    'wage_rate_of_pay_from:FLOAT,wage_rate_of_pay_to:FLOAT,wage_unit_of_pay:STRING,worksite_city:STRING,worksite_county:STRING,'\
                    'worksite_state:STRING,worksite_postal_code:STRING,soc_code:STRING,soc_name:STRING,total_workers:INTEGER,full_time_position:BOOLEAN,'\
                    'prevailing_wage:FLOAT,pw_unit_of_pay:STRING,pw_source:STRING,pw_source_year:STRING,pw_source_other:STRING'

    job_bq_pcoll | 'Write to BigQuery' >> beam.io.Write(
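The 'Write to BigQuery' call above is cut off in this excerpt. Below is a hedged sketch of how it could be completed, reusing the `qualified_table_name` and `table_schema` defined above; it swaps in the `beam.io.WriteToBigQuery` transform, and the dispositions are assumptions rather than the original author's values.

    # Assumed completion of the truncated write above (not the original code).
    job_bq_pcoll | 'Write Job to BigQuery' >> beam.io.WriteToBigQuery(
        qualified_table_name,
        schema=table_schema,
        create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
        write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE)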
Example #27
def run(apache_beam_pipeline_options: PipelineOptions, data_input: str,
        reference_input: str, output: str, metric_types: List[str],
        state_code: Optional[str], person_filter_ids: Optional[List[int]]):
    """Runs the recidivism calculation pipeline."""

    # Workaround to load SQLAlchemy objects at start of pipeline. This is
    # necessary because the BuildRootEntity function tries to access attributes
    # of relationship properties on the SQLAlchemy room_schema_class before they
    # have been loaded. However, if *any* SQLAlchemy objects have been
    # instantiated, then the relationship properties are loaded and their
    # attributes can be successfully accessed.
    _ = schema.StatePerson()

    apache_beam_pipeline_options.view_as(SetupOptions).save_main_session = True

    # Get pipeline job details
    all_pipeline_options = apache_beam_pipeline_options.get_all_options()

    query_dataset = all_pipeline_options['project'] + '.' + data_input
    reference_dataset = all_pipeline_options['project'] + '.' + reference_input

    person_id_filter_set = set(
        person_filter_ids) if person_filter_ids else None

    with beam.Pipeline(options=apache_beam_pipeline_options) as p:
        # Get StatePersons
        persons = (
            p
            | 'Load Persons' >> BuildRootEntity(
                dataset=query_dataset,
                root_entity_class=entities.StatePerson,
                unifying_id_field=entities.StatePerson.get_class_id_name(),
                build_related_entities=True,
                unifying_id_field_filter_set=person_id_filter_set))

        # Get StateIncarcerationPeriods
        incarceration_periods = (
            p
            | 'Load IncarcerationPeriods' >> BuildRootEntity(
                dataset=query_dataset,
                root_entity_class=entities.StateIncarcerationPeriod,
                unifying_id_field=entities.StatePerson.get_class_id_name(),
                build_related_entities=True,
                unifying_id_field_filter_set=person_id_filter_set,
                state_code=state_code))

        # Get StateSupervisionViolations
        supervision_violations = (
            p
            | 'Load SupervisionViolations' >> BuildRootEntity(
                dataset=query_dataset,
                root_entity_class=entities.StateSupervisionViolation,
                unifying_id_field=entities.StatePerson.get_class_id_name(),
                build_related_entities=True,
                unifying_id_field_filter_set=person_id_filter_set,
                state_code=state_code))

        # TODO(2769): Don't bring this in as a root entity
        # Get StateSupervisionViolationResponses
        supervision_violation_responses = (
            p
            | 'Load SupervisionViolationResponses' >> BuildRootEntity(
                dataset=query_dataset,
                root_entity_class=entities.StateSupervisionViolationResponse,
                unifying_id_field=entities.StatePerson.get_class_id_name(),
                build_related_entities=True,
                unifying_id_field_filter_set=person_id_filter_set,
                state_code=state_code))

        # Group StateSupervisionViolationResponses and
        # StateSupervisionViolations by person_id
        supervision_violations_and_responses = (
            {
                'violations': supervision_violations,
                'violation_responses': supervision_violation_responses
            } | 'Group StateSupervisionViolationResponses to '
            'StateSupervisionViolations' >> beam.CoGroupByKey())

        # Set the fully hydrated StateSupervisionViolation entities on
        # the corresponding StateSupervisionViolationResponses
        violation_responses_with_hydrated_violations = (
            supervision_violations_and_responses
            | 'Set hydrated StateSupervisionViolations on '
            'the StateSupervisionViolationResponses' >> beam.ParDo(
                SetViolationOnViolationsResponse()))

        # Group StateIncarcerationPeriods and StateSupervisionViolationResponses
        # by person_id
        incarceration_periods_and_violation_responses = (
            {
                'incarceration_periods': incarceration_periods,
                'violation_responses':
                violation_responses_with_hydrated_violations
            }
            | 'Group StateIncarcerationPeriods to '
            'StateSupervisionViolationResponses' >> beam.CoGroupByKey())

        # Set the fully hydrated StateSupervisionViolationResponse entities on
        # the corresponding StateIncarcerationPeriods
        incarceration_periods_with_source_violations = (
            incarceration_periods_and_violation_responses
            | 'Set hydrated StateSupervisionViolationResponses on '
            'the StateIncarcerationPeriods' >> beam.ParDo(
                SetViolationResponseOnIncarcerationPeriod()))

        # Group each StatePerson with their StateIncarcerationPeriods
        person_and_incarceration_periods = (
            {
                'person':
                persons,
                'incarceration_periods':
                incarceration_periods_with_source_violations
            }
            | 'Group StatePerson to StateIncarcerationPeriods' >>
            beam.CoGroupByKey())

        # Bring in the table that associates people and their county of
        # residence
        person_id_to_county_query = \
            f"SELECT * FROM " \
            f"`{reference_dataset}.persons_to_recent_county_of_residence`"

        person_id_to_county_kv = (
            p
            | "Read person_id to county associations from BigQuery" >>
            beam.io.Read(
                beam.io.BigQuerySource(query=person_id_to_county_query,
                                       use_standard_sql=True))
            | "Convert person_id to county association table to KV" >>
            beam.ParDo(ConvertDictToKVTuple(), 'person_id'))

        # Identify ReleaseEvents from the StatePerson's
        # StateIncarcerationPeriods
        person_events = (
            person_and_incarceration_periods
            | "ClassifyReleaseEvents" >> beam.ParDo(
                ClassifyReleaseEvents(), AsDict(person_id_to_county_kv)))

        # Get pipeline job details for accessing job_id
        all_pipeline_options = apache_beam_pipeline_options.get_all_options()

        # Add timestamp for local jobs
        job_timestamp = datetime.datetime.now().strftime(
            '%Y-%m-%d_%H_%M_%S.%f')
        all_pipeline_options['job_timestamp'] = job_timestamp

        # Get the type of metric to calculate
        metric_types_set = set(metric_types)

        # Get recidivism metrics
        recidivism_metrics = (
            person_events
            | 'Get Recidivism Metrics' >> GetRecidivismMetrics(
                pipeline_options=all_pipeline_options,
                metric_types=metric_types_set))

        if person_id_filter_set:
            logging.warning(
                "Non-empty person filter set - returning before writing metrics."
            )
            return

        # Convert the metrics into a format that's writable to BQ
        writable_metrics = (
            recidivism_metrics
            | 'Convert to dict to be written to BQ' >> beam.ParDo(
                RecidivismMetricWritableDict()).with_outputs(
                    'rates', 'counts'))

        # Write the recidivism metrics to the output tables in BigQuery
        rates_table_id = DATAFLOW_METRICS_TO_TABLES.get(
            ReincarcerationRecidivismRateMetric)
        counts_table_id = DATAFLOW_METRICS_TO_TABLES.get(
            ReincarcerationRecidivismCountMetric)

        _ = (writable_metrics.rates
             | f"Write rate metrics to BQ table: {rates_table_id}" >>
             beam.io.WriteToBigQuery(
                 table=rates_table_id,
                 dataset=output,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))

        _ = (writable_metrics.counts
             | f"Write count metrics to BQ table: {counts_table_id}" >>
             beam.io.WriteToBigQuery(
                 table=counts_table_id,
                 dataset=output,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))
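
A self-contained sketch (illustrative names only) of the AsDict side-input pattern used for the county lookup above: a keyed reference table is handed to a DoFn as a plain dict, so each element can be enriched without another CoGroupByKey.

import apache_beam as beam
from apache_beam.pvalue import AsDict


class EnrichWithCounty(beam.DoFn):
    def process(self, element, county_by_person_id):
        person_id, payload = element
        # The side input arrives as a dict keyed by person_id.
        yield person_id, dict(payload,
                              county=county_by_person_id.get(person_id))


with beam.Pipeline() as p:
    events = p | 'ToyEvents' >> beam.Create([(1, {'event': 'release'})])
    counties = p | 'ToyCounties' >> beam.Create([(1, 'county_a')])
    enriched = events | 'Enrich' >> beam.ParDo(
        EnrichWithCounty(), AsDict(counties))
    enriched | 'Print' >> beam.Map(print)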
Example #28
def run():
    argv = [
        '--project={0}'.format(PROJECT), '--job_name=shq-demo-data-{}'.format(
            datetime.now().strftime('%Y%m%d%H%M%S')), '--save_main_session',
        '--requirements_file=requirements.txt',
        '--staging_location=gs://{0}/staging/'.format(BUCKET),
        '--temp_location=gs://{0}/staging/'.format(BUCKET),
        '--runner=DataflowRunner'
    ]

    # create the pipeline
    p = beam.Pipeline(argv=argv)

    # get pcollection of users
    # read rows (dicts) from BQ
    # convert offset into actual date relative to today
    users = (
        p
        | 'read users from BQ' >> beam.io.Read(
            beam.io.BigQuerySource(
                query=
                'SELECT * FROM [success-hq:datastore.user] order by email {}'.
                format(USER_LIMIT)))
        | 'get users with reg dates' >> beam.Map(get_user_with_regdate))

    # create list of companies and reg dates based on earliest user reg_date
    companies = (users
                 | 'get company and reg date from user' >>
                 beam.Map(get_company_and_regdate)
                 |
                 'find first reg_date for company' >> beam.CombinePerKey(min))

    # convert rows into datastore entities
    # write entities into datastore
    (users
     | 'build user entity' >> beam.Map(build_user_entity)
     | 'write user to Datastore' >> WriteToDatastore(PROJECT))

    # convert into datastore entities
    # write entities into datastore
    (companies
     | 'build company entity' >> beam.Map(build_company_entity)
     | 'write company to Datastore' >> WriteToDatastore(PROJECT))

    # create projects in datastore
    (companies
     | 'create project for company' >> beam.Map(build_project_entities)
     | 'write project to Datastore' >> WriteToDatastore(PROJECT))

    # create trending in datastore
    (companies
     | 'create trending for company' >> beam.Map(build_trending_entities)
     | 'write trending to Datastore' >> WriteToDatastore(PROJECT))

    # create events for company
    company_events = (
        companies
        | 'build company events' >> beam.FlatMap(build_company_events)
        | 'expand company events' >> beam.FlatMap(expand_events))

    # write company events into BQ
    (company_events
     | 'write to BQ table' >> beam.io.Write(
         beam.io.BigQuerySink(
             project=PROJECT,
             dataset=DATASET,
             table='company_events',
             write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE)))

    # find purchases for all companies
    purchases = (company_events
                 |
                 'get purchased amounts' >> beam.FlatMap(get_purchased_amounts)
                 | 'sum purchased amounts' >> beam.CombinePerKey(sum))

    # find provisions for all companies
    provisions = (
        company_events
        | 'get provisioned amounts' >> beam.FlatMap(get_provisioned_amounts)
        | 'sum provisioned amounts' >> beam.CombinePerKey(sum))

    # combine purchase and provision pcollections
    company_updates = {
        'purchased': purchases,
        'provisioned': provisions
    } | beam.CoGroupByKey()

    # write renewal records to datastore
    (company_updates
     | 'create renewal for company' >> beam.Map(build_renewal_entities)
     | 'write renewals to Datastore' >> WriteToDatastore(PROJECT))

    # create registration events for users
    reg_events = users | 'build reg events' >> beam.Map(build_reg_event)

    # create tickets events for users
    ticket_events = users | 'build ticket events' >> beam.FlatMap(
        build_ticket_events)

    # create call events for users
    call_events = (users
                   | 'build call events' >> beam.FlatMap(build_call_events)
                   | 'expand call events' >> beam.FlatMap(expand_events))

    # combine the pcollections
    events = (reg_events, ticket_events, call_events) | beam.Flatten()

    # take daily collections and write them into bq
    (events
     | 'write to bq' >> beam.io.Write(
         beam.io.BigQuerySink(
             '{}:{}.{}'.format(PROJECT, DATASET, TEMP),
             write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND)))

    # run the pipeline
    print('waiting for pipeline to finish, bq partition still to come')
    print('do not close cloud shell window')
    status = p.run().wait_until_finish()

    # copy stuff from temp into partitions
    print('starting bq partition work')
    today = date.today()
    days_past = 182
    bq_client = bigquery.Client(project=PROJECT)
    bq_dataset = bq_client.dataset(DATASET)
    for index in range(0, days_past):
        query_day = (datetime.now() + timedelta(days=1 - index)).date()
        query_start = query_day.strftime('%Y-%m-%d 00:00:00')
        query_end = query_day.strftime('%Y-%m-%d 23:59:59')
        part_string = query_day.strftime('%Y%m%d')
        query = 'SELECT * FROM {}.{} where date >= "{}" and date <= "{}"'.format(
            DATASET, TEMP, query_start, query_end)
        bq_target = bq_dataset.table('user_events${}'.format(part_string))
        job = bq_client.run_async_query(
            'bq_load_{}'.format(datetime.now().strftime('%Y%m%d%H%M%S%f')),
            query)
        job.destination = bq_target
        job.write_disposition = 'WRITE_TRUNCATE'
        job.begin()
    print('Done! You can close the Cloud Shell window')
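
`run_async_query` belongs to a google-cloud-bigquery version that has long been removed. Below is a hedged sketch of the same per-day partition load with the current client API; the destination and write disposition mirror the loop above, and it assumes the `query` string is valid as standard SQL.

from google.cloud import bigquery

bq_client = bigquery.Client(project=PROJECT)
dest = bigquery.DatasetReference(PROJECT, DATASET).table(
    'user_events${}'.format(part_string))
job_config = bigquery.QueryJobConfig(
    destination=dest,
    write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE)
# result() blocks until the partition has been rewritten.
bq_client.query(query, job_config=job_config).result()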
Example #29
File: tfidf.py  Project: zenkibomb/beam
    def expand(self, uri_to_content):

        # Compute the total number of documents, and prepare a singleton
        # PCollection to use as side input.
        total_documents = (uri_to_content
                           | 'GetUris 1' >> beam.Keys()
                           | 'GetUniqueUris' >> beam.Distinct()
                           | 'CountUris' >> beam.combiners.Count.Globally())

        # Create a collection of pairs mapping a URI to each of the words
        # in the document associated with that URI.

        def split_into_words(uri_line):
            (uri, line) = uri_line
            return [(uri, w.lower()) for w in re.findall(r'[A-Za-z\']+', line)]

        uri_to_words = (uri_to_content
                        | 'SplitWords' >> beam.FlatMap(split_into_words))

        # Compute a mapping from each word to the total number of documents
        # in which it appears.
        word_to_doc_count = (
            uri_to_words
            | 'GetUniqueWordsPerDoc' >> beam.Distinct()
            | 'GetWords' >> beam.Values()
            | 'CountDocsPerWord' >> beam.combiners.Count.PerElement())

        # Compute a mapping from each URI to the total number of words in the
        # document associated with that URI.
        uri_to_word_total = (
            uri_to_words
            | 'GetUris 2' >> beam.Keys()
            | 'CountWordsInDoc' >> beam.combiners.Count.PerElement())

        # Count, for each (URI, word) pair, the number of occurrences of that word
        # in the document associated with the URI.
        uri_and_word_to_count = (
            uri_to_words
            | 'CountWord-DocPairs' >> beam.combiners.Count.PerElement())

        # Adjust the above collection to a mapping from (URI, word) pairs to counts
        # into an isomorphic mapping from URI to (word, count) pairs, to prepare
        # for a join by the URI key.
        def shift_keys(uri_word_count):
            return (uri_word_count[0][0], (uri_word_count[0][1],
                                           uri_word_count[1]))

        uri_to_word_and_count = (uri_and_word_to_count
                                 | 'ShiftKeys' >> beam.Map(shift_keys))

        # Perform a CoGroupByKey (a sort of pre-join) on the prepared
        # uri_to_word_total and uri_to_word_and_count tagged by 'word totals' and
        # 'word counts' strings. This yields a mapping from URI to a dictionary
        # that maps each tag string to an iterable containing, respectively, the
        # word total for that URI and its (word, count) pairs.
        #
        # A diagram (in which '[]' just means 'iterable'):
        #
        #   URI: {'word totals': [count],  # Total words within this URI's document.
        #         'word counts': [(word, count),  # Counts of specific words
        #                         (word, count),  # within this URI's document.
        #                         ... ]}
        uri_to_word_and_count_and_total = (
            {
                'word totals': uri_to_word_total,
                'word counts': uri_to_word_and_count
            }
            | 'CoGroupByUri' >> beam.CoGroupByKey())

        # Compute a mapping from each word to a (URI, term frequency) pair for each
        # URI. A word's term frequency for a document is simply the number of times
        # that word occurs in the document divided by the total number of words in
        # the document.

        def compute_term_frequency(uri_count_and_total):
            (uri, count_and_total) = uri_count_and_total
            word_and_count = count_and_total['word counts']
            # We have an iterable for one element that we want extracted.
            [word_total] = count_and_total['word totals']
            for word, count in word_and_count:
                yield word, (uri, float(count) / word_total)

        word_to_uri_and_tf = (
            uri_to_word_and_count_and_total
            | 'ComputeTermFrequencies' >> beam.FlatMap(compute_term_frequency))

        # Compute a mapping from each word to its document frequency.
        # A word's document frequency in a corpus is the number of
        # documents in which the word appears divided by the total
        # number of documents in the corpus.
        #
        # This calculation uses a side input: an auxiliary value computed by the
        # pipeline and presented to each invocation of the function passed to
        # Map(). The second argument to that function (called total; note that
        # the first argument is a tuple) receives the value listed after the
        # function in Map(). Additional side inputs (and ordinary Python values,
        # too) can be provided to Map and ParDo transforms in this way.
        def div_word_count_by_total(word_count, total):
            (word, count) = word_count
            return (word, float(count) / total)

        word_to_df = (
            word_to_doc_count
            | 'ComputeDocFrequencies' >> beam.Map(
                div_word_count_by_total, AsSingleton(total_documents)))

        # Join the term frequency and document frequency collections,
        # each keyed on the word.
        word_to_uri_and_tf_and_df = (
            {
                'tf': word_to_uri_and_tf,
                'df': word_to_df
            }
            | 'CoGroupWordsByTf-df' >> beam.CoGroupByKey())

        # Compute a mapping from each word to a (URI, TF-IDF) score for each URI.
        # There are a variety of definitions of TF-IDF
        # ("term frequency - inverse document frequency") score; here we use a
        # basic version that is the term frequency multiplied by the log of
        # the inverse document frequency (matching tf * log(1 / df) below).

        def compute_tf_idf(word_tf_and_df):
            (word, tf_and_df) = word_tf_and_df
            [docf] = tf_and_df['df']
            for uri, tf in tf_and_df['tf']:
                yield word, (uri, tf * math.log(1 / docf))

        word_to_uri_and_tfidf = (
            word_to_uri_and_tf_and_df
            | 'ComputeTf-idf' >> beam.FlatMap(compute_tf_idf))

        return word_to_uri_and_tfidf
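
A hedged usage sketch: assuming the expand() above belongs to a beam.PTransform subclass (called TfIdf here purely for illustration), it can be exercised on a tiny in-memory corpus of (uri, line) pairs.

import apache_beam as beam

with beam.Pipeline() as p:
    corpus = p | 'ToyCorpus' >> beam.Create([
        ('doc1.txt', 'the quick brown fox'),
        ('doc2.txt', 'the lazy dog'),
    ])
    # TfIdf: assumed name of the PTransform class that owns the expand() above.
    # Yields (word, (uri, tfidf_score)) pairs.
    corpus | 'TfIdf' >> TfIdf() | 'Print' >> beam.Map(print)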
Example #30
def run(argv=None):
    # TODO: DROP indexes on purch log in DB

    parser = argparse.ArgumentParser()
    parser.add_argument('--startDate',
                        dest='startDate',
                        type=parse_string_date,
                        default=(datetime.now() - timedelta(days=30)).replace(
                            hour=0, minute=0, second=0, microsecond=0),
                        help='Start date.')
    parser.add_argument('--endDate',
                        dest='endDate',
                        type=parse_string_date,
                        default=datetime.now().replace(hour=23,
                                                       minute=59,
                                                       second=59,
                                                       microsecond=999999),
                        help='End date.')

    known_args, pipeline_args = parser.parse_known_args(argv)

    pg = PostgresDb()
    pipeline_options = PipelineOptions(pipeline_args)

    # clear previously dumped rows from the staging table
    pg.truncate_table_by_delete(
        table_name='stage.tmp_items_mean_purchase_time')

    time_boundaries_list = collection_range_timestamps(
        startDate=known_args.startDate,
        endDate=known_args.endDate,
        delta=timedelta(hours=1),
        return_as_list=True)

    #time_boundaries_list = collection_range_timestamps(startDate=datetime(2019, 1, 29, 0, 0, 0),
    #                                                   endDate=datetime(2019, 1, 29, 1, 0, 0),
    #                                                   delta=timedelta(hours=1),
    #                                                   return_as_list=True)

    # process the source data in bulks
    for time_boundaries_bulk_list in list_chunks(time_boundaries_list, 15):
        with beam.Pipeline(options=pipeline_options) as p:
            t_boundaries_sources = (p
                                    | 'next_time_boundaries_bulk' >>
                                    beam.Create(time_boundaries_bulk_list))

            purch_log_data = (
                t_boundaries_sources
                | 'sql_prepare_purchase_log' >> beam.ParDo(DoFnQueryPurchLog())
                | 'sql_execute_purchase_log' >> beam.ParDo(
                    DoFnExecuteSql(table_tag='purchase_log', pg_db=pg)))

            matches_players_data = (
                t_boundaries_sources
                | 'sql_prepare_matches_players' >> beam.ParDo(
                    DoFnQueryMatchesPlayers())
                | 'sql_execute_matches_players' >> beam.ParDo(
                    DoFnExecuteSql(table_tag='matches_players', pg_db=pg)))

            ({
                'purch_log': purch_log_data,
                'matches_players': matches_players_data
            }
             | 'group_by_match_id_player_num' >> beam.CoGroupByKey()
             | 'retrieve_heroes_items_purch_times' >> beam.ParDo(
                 DoFnRetrieveHeroesItemsPurchTime())
             | 'group_by_heroes' >> beam.GroupByKey()
             | 'dump_stat_source' >> beam.ParDo(DoFnDumpStatSource(pg_db=pg)))

    with beam.Pipeline(options=pipeline_options) as p:
        (p
         | 'heroes_collection' >> collection_heroes()
         | 'calculate_purch_statistics' >> beam.ParDo(
             DoFnCalculatePurchStatistics(pg_db=pg)))
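
An illustrative, self-contained sketch of the chunking pattern above: the time boundaries are split into bulks and a fresh pipeline runs per bulk, keeping each Create() small. The helper and toy boundaries below are stand-ins, not the author's utilities.

import apache_beam as beam


def list_chunks_sketch(items, size):
    # Stand-in for the list_chunks helper used above.
    for i in range(0, len(items), size):
        yield items[i:i + size]


toy_boundaries = [('2019-01-29 00:00:00', '2019-01-29 01:00:00'),
                  ('2019-01-29 01:00:00', '2019-01-29 02:00:00')]

for bulk in list_chunks_sketch(toy_boundaries, 15):
    with beam.Pipeline() as p:
        _ = (p
             | 'next_time_boundaries_bulk' >> beam.Create(bulk)
             | 'format_window' >> beam.Map(
                 lambda b: 'window {}..{}'.format(*b))
             | 'print_window' >> beam.Map(print))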