def generate_events(self):
    publish_client = pubsub.Client(project=self.project)
    topic = publish_client.topic(self.topic_name)
    sub = topic.subscription(self.subscription_name)

    logging.info('Generating auction events to topic %s', topic.name)

    if self.args.input.startswith('gs://'):
      from apache_beam.io.gcp.gcsfilesystem import GCSFileSystem
      fs = GCSFileSystem(self.pipeline_options)
      with fs.open(self.args.input) as infile:
        for line in infile:
          topic.publish(line)
    else:
      with open(self.args.input) as infile:
        for line in infile:
          topic.publish(line)

    logging.info('Finished event generation.')

    # Read from PubSub into a PCollection.
    if self.args.subscription_name:
      raw_events = self.pipeline | 'ReadPubSub' >> beam.io.ReadFromPubSub(
          subscription=sub.full_name)
    else:
      raw_events = self.pipeline | 'ReadPubSub' >> beam.io.ReadFromPubSub(
          topic=topic.full_name)

    return raw_events
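
Note: the generate_events snippets here use the legacy pubsub.Client API from early
google-cloud-pubsub releases. Below is a minimal sketch of the same publish loop
against the current pubsub_v1 client; publish_lines and its arguments are
illustrative placeholders, not part of the original code.

# Hedged sketch: publishing file lines with the current google-cloud-pubsub
# client (pubsub_v1), where message data must be bytes. Names are placeholders.
from google.cloud import pubsub_v1

def publish_lines(project, topic_name, path):
    publisher = pubsub_v1.PublisherClient()
    # topic_path() builds 'projects/<project>/topics/<topic_name>'.
    topic_path = publisher.topic_path(project, topic_name)
    with open(path) as infile:
        futures = [
            publisher.publish(topic_path, data=line.encode('utf-8'))
            for line in infile
        ]
    # Block until Pub/Sub has accepted every message.
    for future in futures:
        future.result()
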
Example 2
    def generate_events(self):
        publish_client = pubsub.Client(project=self.project)
        topic = publish_client.topic(self.topic_name)
        sub = topic.subscription(self.subscription_name)

        logging.info('Generating auction events to topic %s', topic.name)

        if self.args.input.startswith('gs://'):
            from apache_beam.io.gcp.gcsfilesystem import GCSFileSystem
            fs = GCSFileSystem(self.pipeline_options)
            with fs.open(self.args.input) as infile:
                for line in infile:
                    topic.publish(line)
        else:
            with open(self.args.input) as infile:
                for line in infile:
                    topic.publish(line)

        logging.info('Finished event generation.')

        # Read from PubSub into a PCollection.
        if self.args.subscription_name:
            raw_events = self.pipeline | 'ReadPubSub' >> beam.io.ReadFromPubSub(
                subscription=sub.full_name)
        else:
            raw_events = self.pipeline | 'ReadPubSub' >> beam.io.ReadFromPubSub(
                topic=topic.full_name)
        raw_events = (
            raw_events
            | 'deserialization' >> beam.ParDo(nexmark_util.ParseJsonEventFn())
            | 'timestamping' >>
            beam.Map(lambda e: window.TimestampedValue(e, e.date_time)))
        return raw_events
Example 3
    def generate_events(self):
        publish_client = pubsub.Client(project=self.project)
        topic = publish_client.topic(self.topic_name)
        sub = topic.subscription(self.subscription_name)

        logging.info('Generating auction events to topic %s', topic.name)

        if self.args.input.startswith('gs://'):
            from apache_beam.io.gcp.gcsfilesystem import GCSFileSystem
            fs = GCSFileSystem(self.pipeline_options)
            with fs.open(self.args.input) as infile:
                for line in infile:
                    topic.publish(line)
        else:
            with open(self.args.input) as infile:
                for line in infile:
                    topic.publish(line)

        logging.info('Finished event generation.')

        # Read from PubSub into a PCollection.
        if self.args.subscription_name:
            raw_events = self.pipeline | 'ReadPubSub' >> beam.io.ReadFromPubSub(
                subscription=sub.full_name)
        else:
            raw_events = self.pipeline | 'ReadPubSub' >> beam.io.ReadFromPubSub(
                topic=topic.full_name)

        return raw_events
Example 4
    def parse_element(self, element):
        message = json.loads(element.data)
        bucket = message['bucket']
        # Only import from the bucket we are expecting.
        if bucket != self.bucket_name:
            return []
        filepath = message['name']
        logging.info('Got file: %s, %s', bucket, filepath)
        logging.info('Got -: %s', message)
        logline_metadata = None
        #    try:
        # Split path component. Expecting logs/date/bundleId/env/
        path_comps = filepath.split('/')
        # Expecting at least logs/date/bundleId/env, so guard index 3.
        if len(path_comps) < 4 or (path_comps[3] != self.env
                                   and self.env is not None):
            logging.info('Skipping %s', filepath)
            return []
        name = path_comps[len(path_comps) - 1]
        if name.endswith('.txt'):
            name = name[0:len(name) - 4]
        name_comps = name.split('_')
        self.env = path_comps[3]
        self.log_name = 'client-logs-%s' % (
            self.env) if self.log_name is None else self.log_name
        logline_metadata = {
            'suffix': name_comps[2],
            'bundleId': path_comps[2],
            'env': path_comps[3],
            'phone': urllib2.unquote(name_comps[0]).decode('utf8'),
            'filepath': filepath
        }
        self.logline_metadata = logline_metadata
        logging.info('Got file: %s with %s', filepath, logline_metadata)

        if not self.gcs:
            # These imports have to be nested (ugh) because the constructor and the
            # main pipeline get evaluated locally when deploying remotely from
            # the cmdline, and this class is only available when running on GCS
            from apache_beam.io.gcp.gcsfilesystem import GCSFileSystem
            self.gcs = GCSFileSystem(PipelineOptions(self.pipeline_args))
            self.logger = stackdriver_logging.Client().logger(self.log_name)

        # Read the whole file (ugh) from GCS. Without Splittable DoFn support
        # in Python, that's the best we can do in Dataflow right now.

        with self.gcs.open('gs://%s/%s' % (bucket, filepath),
                           mime_type='text/plain') as infile:
            for line in infile:
                if sys.getsizeof(line) > 1000:
                    lines = textwrap.wrap(line, 1000, break_long_words=False)
                    for text in lines:
                        self.writeLog(text)
                else:
                    self.writeLog(line)
        return []
Example 5
    def _data_to_load(self,
                      gcs: GCSFileSystem,
                      scan_type: str,
                      incremental_load: bool,
                      table_name: str,
                      start_date: Optional[datetime.date] = None,
                      end_date: Optional[datetime.date] = None) -> List[str]:
        """Select the right files to read.

    Args:
      gcs: GCSFileSystem object
      scan_type: one of 'echo', 'discard', 'http', 'https'
      incremental_load: boolean. If true, only read the latest new data
      table_name: dataset.table name like 'base.scan_echo'
      start_date: date object, only files after or at this date will be read
      end_date: date object, only files at or before this date will be read

    Returns:
      A list of filename strings, e.g.
       ['gs://firehook-scans/echo/CP_Quack-echo-2020-08-22-06-08-03/results.json',
        'gs://firehook-scans/echo/CP_Quack-echo-2020-08-23-06-01-02/results.json']
    """
        if incremental_load:
            full_table_name = self._get_full_table_name(table_name)
            existing_sources = _get_existing_datasources(full_table_name)
        else:
            existing_sources = []

        # Both zipped and unzipped data to be read in
        zipped_regex = self.bucket + scan_type + '/**/results.json.gz'
        unzipped_regex = self.bucket + scan_type + '/**/results.json'

        zipped_metadata = [m.metadata_list
                           for m in gcs.match([zipped_regex])][0]
        unzipped_metadata = [
            m.metadata_list for m in gcs.match([unzipped_regex])
        ][0]
        file_metadata = zipped_metadata + unzipped_metadata

        filenames = [metadata.path for metadata in file_metadata]
        file_sizes = [metadata.size_in_bytes for metadata in file_metadata]

        filtered_filenames = [
            filename for (filename, file_size) in zip(filenames, file_sizes)
            if (_between_dates(filename, start_date, end_date)
                and _source_from_filename(filename) not in existing_sources
                and file_size != 0)
        ]
        return filtered_filenames
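
The filtering above leans on the Beam filesystems API: match() takes a list of glob
patterns and returns MatchResult objects whose metadata_list entries expose path and
size_in_bytes. A self-contained sketch of that pattern follows; the bucket, scan type
and function name are placeholders, not taken from the original project.

# Hedged sketch of GCSFileSystem.match(): list non-empty results.json files.
from apache_beam.io.gcp.gcsfilesystem import GCSFileSystem
from apache_beam.options.pipeline_options import PipelineOptions

def list_nonempty_results(bucket='gs://example-bucket/', scan_type='echo'):
    gcs = GCSFileSystem(PipelineOptions())
    # One MatchResult is returned per pattern passed to match().
    match_result = gcs.match([bucket + scan_type + '/**/results.json*'])[0]
    return [
        metadata.path
        for metadata in match_result.metadata_list
        if metadata.size_in_bytes > 0
    ]
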
Example 6
def run(argv=None):
    """The main function which creates the pipeline and runs it."""
    parser = argparse.ArgumentParser()
    # Here we add the specific command line arguments we expect:
    # the input Pub/Sub subscription to read from, the output bucket for the
    # copied data, and the bucket to which pipeline logs are written.

    parser.add_argument(
        '--input_subscription',
        required=True,
        help=('Input PubSub subscription of the form '
              '"projects/<PROJECT>/subscriptions/<SUBSCRIPTION>".'))

    parser.add_argument('--output',
                        required=True,
                        help='Output bucket for data',
                        default='')
    parser.add_argument('--log', required=True, help='log bucket', default='')

    # Parse arguments from the command line.
    known_args, pipeline_args = parser.parse_known_args(argv)
    #import pprint
    #pprint.pprint(known_args)
    #pprint.pprint(pipeline_args)
    #pprint.pprint(pipeline_options.get_all_options())

    # We use the save_main_session option because one or more DoFn's in this
    # workflow rely on global context (e.g., a module imported at module level).
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    pipeline_options.view_as(StandardOptions).streaming = True

    # get options
    project_id = pipeline_options.get_all_options()['project']
    output_bucket_name = known_args.output
    log_bucket_name = known_args.log
    log_file_path = 'gs://{}/logs'.format(log_bucket_name)

    fs = GCSFileSystem(pipeline_options=pipeline_options)

    # DataCopier is a class we built in this script to hold the logic for
    # copying each incoming file to the final output bucket.
    data_copier = DataCopier()
    # Instantiate the pipeline using the pipeline arguments passed in from the
    # command line. These include information such as where Dataflow should
    # store temp files and what the project id is.

    p = beam.Pipeline(options=pipeline_options)

    (p | beam.io.ReadFromPubSub(subscription=known_args.input_subscription)
     | 'Copying customer data to the final data-bucket/customer-id' >>
     beam.Map(lambda m: data_copier.parse_method(m, project_id, fs,
                                                 output_bucket_name))
     | 'Write results to the output bucket' >>
     WriteToText(file_path_prefix=log_file_path))

    p.run().wait_until_finish()
Example 7
    def run_beam_pipeline(self, scan_type: str, incremental_load: bool,
                          job_name: str, table_name: str,
                          start_date: Optional[datetime.date],
                          end_date: Optional[datetime.date]) -> None:
        """Run a single apache beam pipeline to load json data into bigquery.

    Args:
      scan_type: one of 'echo', 'discard', 'http', 'https' or 'satellite'
      incremental_load: boolean. If true, only load the latest new data, if
        false reload all data.
      job_name: string name for this pipeline job.
      table_name: dataset.table name like 'base.scan_echo'
      start_date: date object, only files after or at this date will be read.
        Mostly only used during development.
      end_date: date object, only files at or before this date will be read.
        Mostly only used during development.

    Raises:
      Exception: if any arguments are invalid or the pipeline fails.
    """
        logging.getLogger().setLevel(logging.INFO)
        pipeline_options = self._get_pipeline_options(scan_type, job_name)
        gcs = GCSFileSystem(pipeline_options)

        new_filenames = self._data_to_load(gcs, scan_type, incremental_load,
                                           table_name, start_date, end_date)
        if not new_filenames:
            logging.info('No new files to load')
            return

        with beam.Pipeline(options=pipeline_options) as p:
            # PCollection[Tuple[filename,line]]
            lines = _read_scan_text(p, new_filenames)

            if scan_type == satellite.SCAN_TYPE_SATELLITE:
                # PCollection[Row], PCollection[Row]
                satellite_rows, blockpage_rows = satellite.process_satellite_lines(
                    lines)

                # PCollection[Row]
                rows_with_metadata = self._add_metadata(satellite_rows)

                self._write_to_bigquery(
                    satellite.SCAN_TYPE_BLOCKPAGE, blockpage_rows,
                    satellite.get_blockpage_table_name(table_name, scan_type),
                    incremental_load)
            else:  # Hyperquack scans
                # PCollection[Row]
                rows = (lines | 'flatten json' >> beam.ParDo(
                    flatten.FlattenMeasurement()).with_output_types(Row))

                # PCollection[Row]
                rows_with_metadata = self._add_metadata(rows)

            _raise_error_if_collection_empty(rows_with_metadata)

            self._write_to_bigquery(scan_type, rows_with_metadata, table_name,
                                    incremental_load)
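
The helper _read_scan_text is not included in this example; its documented output is a
PCollection[Tuple[filename, line]]. One way to build such a collection with the same
GCSFileSystem is sketched below. ReadGcsLines and its constructor argument are
hypothetical stand-ins, not the project's actual implementation.

# Hedged sketch: emit (filename, line) tuples for a known list of GCS paths.
import apache_beam as beam
from apache_beam.io.gcp.gcsfilesystem import GCSFileSystem
from apache_beam.options.pipeline_options import PipelineOptions

class ReadGcsLines(beam.DoFn):
    """Opens each GCS path and yields (filename, line) tuples."""

    def __init__(self, pipeline_args=None):
        self.pipeline_args = pipeline_args or []
        self.gcs = None

    def process(self, filename):
        if self.gcs is None:
            # Construct the filesystem lazily so it is created on the worker.
            self.gcs = GCSFileSystem(PipelineOptions(self.pipeline_args))
        with self.gcs.open(filename) as infile:
            for line in infile:
                # Lines come back as bytes; decoding to text is an assumption here.
                yield (filename, line.decode('utf-8'))

# Usage inside the pipeline above (sketch):
#   lines = (p
#            | 'create filenames' >> beam.Create(new_filenames)
#            | 'read lines' >> beam.ParDo(ReadGcsLines(pipeline_args)))
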
Example 8
def get_filesystem(path):
    """Function that returns the FileSystem class to use based on the path
  provided in the input.
  """
    if path.startswith('gs://'):
        from apache_beam.io.gcp.gcsfilesystem import GCSFileSystem
        return GCSFileSystem()
    else:
        return LocalFileSystem()
Example 9
    def generate_events(self):
        from google.cloud import pubsub
        publish_client = pubsub.Client(project=self.project)
        topic = publish_client.topic(self.topic_name)

        logging.info('Generating auction events to topic %s', topic.name)

        if self.args.input.startswith('gs://'):
            from apache_beam.io.gcp.gcsfilesystem import GCSFileSystem
            fs = GCSFileSystem(self.pipeline_options)
            with fs.open(self.args.input) as infile:
                for line in infile:
                    topic.publish(line)
        else:
            with open(self.args.input) as infile:
                for line in infile:
                    topic.publish(line)

        logging.info('Finished event generation.')
Example 10
def get_filesystem(path):
    """Function that returns the FileSystem class to use based on the path
  provided in the input.
  """
    if path.startswith('gs://'):
        try:
            from apache_beam.io.gcp.gcsfilesystem import GCSFileSystem
        except ImportError:
            raise ImportError('Google Cloud Platform IO not available, '
                              'please install apache_beam[gcp]')
        return GCSFileSystem()
    return LocalFileSystem()
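
A short usage sketch for the dispatcher above: because both branches return an Apache
Beam FileSystem with the same interface, callers can treat local and GCS paths
uniformly. count_lines and the example paths are hypothetical, and the no-argument
GCSFileSystem() constructor assumes the older Beam releases these snippets target.

# Hedged usage sketch for get_filesystem(); names and paths are placeholders.
def count_lines(path):
    fs = get_filesystem(path)
    with fs.open(path) as infile:
        return sum(1 for _ in infile)

# count_lines('/tmp/events.txt')          # handled by LocalFileSystem
# count_lines('gs://my-bucket/events')    # handled by GCSFileSystem (apache_beam[gcp])
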
Example 11
    def _data_to_load(self,
                      gcs: GCSFileSystem,
                      scan_type: str,
                      incremental_load: bool,
                      table_name: str,
                      start_date: Optional[datetime.date] = None,
                      end_date: Optional[datetime.date] = None) -> List[str]:
        """Select the right files to read.

    Args:
      gcs: GCSFileSystem object
      scan_type: one of 'echo', 'discard', 'http', 'https', 'satellite'
      incremental_load: boolean. If true, only read the latest new data
      table_name: dataset.table name like 'base.scan_echo'
      start_date: date object, only files after or at this date will be read
      end_date: date object, only files at or before this date will be read

    Returns:
      A list of filename strings, e.g.
       ['gs://firehook-scans/echo/CP_Quack-echo-2020-08-22-06-08-03/results.json',
        'gs://firehook-scans/echo/CP_Quack-echo-2020-08-23-06-01-02/results.json']
    """
        if incremental_load:
            full_table_name = self._get_full_table_name(table_name)
            existing_sources = _get_existing_datasources(full_table_name)
        else:
            existing_sources = []

        if scan_type == satellite.SCAN_TYPE_SATELLITE:
            files_to_load = flatten_satellite.SATELLITE_FILES
        else:
            files_to_load = SCAN_FILES

        # Filepath like 'gs://firehook-scans/echo/**/*'
        files_regex = f'{self.bucket}{scan_type}/**/*'
        file_metadata = [m.metadata_list for m in gcs.match([files_regex])][0]

        filepaths = [metadata.path for metadata in file_metadata]
        file_sizes = [metadata.size_in_bytes for metadata in file_metadata]

        filtered_filenames = [
            filepath for (filepath, file_size) in zip(filepaths, file_sizes)
            if (_between_dates(filepath, start_date, end_date)
                and _filename_matches(filepath, files_to_load)
                and flatten_base.source_from_filename(filepath) not in
                existing_sources and file_size > EMPTY_GZIPPED_FILE_SIZE)
        ]

        return filtered_filenames
Example 12
def run():
    p = beam.Pipeline(options=PipelineOptions())
    gcs = GCSFileSystem(PipelineOptions())
    pattern_1 = [
        'gs://dataflow-buffer/parent-unpack/2018/i20180130/PxpFJwJabD-untarI20180130/DESIGN/USD0808610-20180130.ZIP']
    input_pattern = ['gs://dataflow-buffer/parent-unpack/2018/i20180130/PxpFJwJabD-untar*/**/*.ZIP']
    input_pattern_1 = 'gs://dataflow-buffer/parent-unpack/2018/i20180130/PxpFJwJabD-untar*/**/*.ZIP'

    parent_zip = 'gs://bulk_pdfimages_dump/bulkdata.uspto.gov/data/patent/grant/redbook/2010/I20100202.zip'

    result = [m.metadata_list for m in gcs.match(input_pattern)]

    metadata_list = result.pop()

    print 'satya'
    parts = (p
             # | 'Match Files' >> fileio.MatchFiles(pattern_1)
             | 'Return nested files' >> beam.Create(metadata_list)
             # | 'print Files' >> beam
             | 'Print read file' >> beam.ParDo(ImageExtract())
             # | 'one' >> beam.Map()
             )

    p.run().wait_until_finish()
Example 13
    def run_beam_pipeline(self, scan_type: str, incremental_load: bool,
                          job_name: str, table_name: str,
                          start_date: Optional[datetime.date],
                          end_date: Optional[datetime.date]) -> None:
        """Run a single apache beam pipeline to load json data into bigquery.

    Args:
      scan_type: one of 'echo', 'discard', 'http', 'https'
      incremental_load: boolean. If true, only load the latest new data, if
        false reload all data.
      job_name: string name for this pipeline job.
      table_name: dataset.table name like 'base.scan_echo'
      start_date: date object, only files after or at this date will be read.
        Mostly only used during development.
      end_date: date object, only files at or before this date will be read.
        Mostly only used during development.

    Raises:
      Exception: if any arguments are invalid or the pipeline fails.
    """
        logging.getLogger().setLevel(logging.INFO)
        pipeline_options = self._get_pipeline_options(scan_type, job_name)
        gcs = GCSFileSystem(pipeline_options)

        new_filenames = self._data_to_load(gcs, scan_type, incremental_load,
                                           table_name, start_date, end_date)
        if not new_filenames:
            logging.info('No new files to load incrementally')
            return

        with beam.Pipeline(options=pipeline_options) as p:
            # PCollection[Tuple[filename,line]]
            lines = _read_scan_text(p, new_filenames)

            # PCollection[Row]
            rows = (
                lines | 'flatten json' >>
                beam.FlatMapTuple(_flatten_measurement).with_output_types(Row))

            # PCollection[Row]
            rows_with_metadata = self._add_metadata(rows)

            self._write_to_bigquery(rows_with_metadata, table_name,
                                    incremental_load)
Example 14
class ReadGCSNotifications(beam.PTransform):
    def __init__(self, env, bucket_name, log_name, pipeline_args):
        self.bucket_name = bucket_name
        self.env = env
        self.gcs = None
        self.pipeline_args = pipeline_args
        self.log_name = log_name

    def parse_element(self, element):
        message = json.loads(element.data)
        bucket = message['bucket']
        # Only import from the bucket we are expecting.
        if bucket != self.bucket_name:
            return []
        filepath = message['name']
        logging.info('Got file: %s, %s', bucket, filepath)
        logging.info('Got -: %s', message)
        logline_metadata = None
        #    try:
        # Split path component. Expecting logs/date/bundleId/env/
        path_comps = filepath.split('/')
        # Expecting at least logs/date/bundleId/env, so guard index 3.
        if len(path_comps) < 4 or (path_comps[3] != self.env
                                   and self.env is not None):
            logging.info('Skipping %s', filepath)
            return []
        name = path_comps[len(path_comps) - 1]
        if name.endswith('.txt'):
            name = name[0:len(name) - 4]
        name_comps = name.split('_')
        self.env = path_comps[3]
        self.log_name = 'client-logs-%s' % (
            self.env) if self.log_name is None else self.log_name
        logline_metadata = {
            'suffix': name_comps[2],
            'bundleId': path_comps[2],
            'env': path_comps[3],
            'phone': urllib2.unquote(name_comps[0]).decode('utf8'),
            'filepath': filepath
        }
        self.logline_metadata = logline_metadata
        logging.info('Got file: %s with %s', filepath, logline_metadata)

        if not self.gcs:
            # These imports have to be nested (ugh) because the constructor and the
            # main pipeline get evaluated locally when deploying remotely from
            # the cmdline, and this class is only available when running on GCS
            from apache_beam.io.gcp.gcsfilesystem import GCSFileSystem
            self.gcs = GCSFileSystem(PipelineOptions(self.pipeline_args))
            self.logger = stackdriver_logging.Client().logger(self.log_name)

        # Read the whole file (ugh) from GCS. Without Splittable DoFn support
        # in Python, that's the best we can do in Dataflow right now.

        with self.gcs.open('gs://%s/%s' % (bucket, filepath),
                           mime_type='text/plain') as infile:
            for line in infile:
                if sys.getsizeof(line) > 1000:
                    lines = textwrap.wrap(line, 1000, break_long_words=False)
                    for text in lines:
                        self.writeLog(text)
                else:
                    self.writeLog(line)
        return []

    def writeLog(self, text):
        severity_pattern = re.compile('^([A-Za-z]+)')
        severity_remappings = {
            'TRACE': 'DEBUG',
            'LOG': 'DEBUG',
            'WARN': 'WARNING',
            'CRIT': 'CRITICAL'
        }
        # Build log element from message, and labels from metadata
        log_element = dict(self.logline_metadata)
        log_element['msg'] = text

        # Try to parse out the severity from the start of the line
        # And try and make sure it maps to a valid SD severity
        match = severity_pattern.match(text)
        if match:
            log_severity = match.group(1).upper()
            log_severity = severity_remappings.get(log_severity, log_severity)
            try:
                # Write the struct to SD using the hopefully valid severity
                self.logger.log_struct(log_element, severity=log_severity)
            except:
                # Write the struct to SD without a severity
                self.logger.log_struct(log_element)
        else:
            # Write the struct to SD without a severity
            self.logger.log_struct(log_element)

    def expand(self, pcoll):
        return pcoll | 'ReadGCSNotifications' >> beam.FlatMap(
            self.parse_element)
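
The transform above expects Pub/Sub messages whose data payload is a GCS notification
in JSON, which is why parse_element calls json.loads(element.data). A hedged wiring
sketch follows; the project id, subscription, bucket and log names are placeholders.

# Hedged sketch: applying ReadGCSNotifications (defined above) in a streaming pipeline.
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions, StandardOptions

pipeline_args = ['--project=my-project']
options = PipelineOptions(pipeline_args)
options.view_as(StandardOptions).streaming = True

with beam.Pipeline(options=options) as p:
    _ = (p
         | 'ReadNotifications' >> beam.io.ReadFromPubSub(
             subscription='projects/my-project/subscriptions/gcs-notifications',
             with_attributes=True)  # yields PubsubMessage objects exposing .data
         | 'ParseAndLog' >> ReadGCSNotifications(
             env='prod',
             bucket_name='my-client-logs',
             log_name=None,
             pipeline_args=pipeline_args))
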
Example 15
    def parse_element(self, element):

        if not self.gcs:
            # These imports have to be nested (ugh) because the constructor and the
            # main pipeline get evaluated locally when deploying remotely from
            # the cmdline, and this class is only available when running on GCS
            from apache_beam.io.gcp.gcsfilesystem import GCSFileSystem
            self.gcs = GCSFileSystem(PipelineOptions(self.pipeline_args))
            self.logger = stackdriver_logging.Client().logger(self.log_name)

        message = json.loads(element.data)
        bucket = message['bucket']

        # Only import from the bucket we are expecting.
        if bucket != self.bucket_name:
            return []

        filepath = message['name']
        logging.info('Got file: %s, %s', bucket, filepath)

        logging.info('Got -: %s', message)

        logline_metadata = None

        #    try:
        # Split path component. Expecting logs/bundleId/env/
        path_comps = filepath.split('/')

        if len(path_comps) < 3 or path_comps[2] != self.env:
            logging.info('Skipping %s', filepath)

            return []

        name = path_comps[len(path_comps) - 1]
        if name.endswith('.txt'):
            name = name[0:len(name) - 4]
        name_comps = name.split('_')

        logline_metadata = {
            'suffix': name_comps[2],
            'bundleId': path_comps[1],
            'env': path_comps[2],
            'phone': urllib2.unquote(name_comps[0]).decode('utf8'),
            'filepath': filepath
        }
        #   except:
        #     logging.warn("Couldn't read metadata for %s", filepath)
        #     return []

        logging.info('Got file: %s with %s', filepath, logline_metadata)

        severity_pattern = re.compile('^([A-Za-z]+)')
        severity_remappings = {
            'TRACE': 'DEBUG',
            'LOG': 'DEBUG',
            'WARN': 'WARNING',
            'CRIT': 'CRITICAL'
        }

        # Read the whole file (ugh) from GCS. Without Splittable DoFn support
        # in Python, that's the best we can do in Dataflow right now.

        with self.gcs.open('gs://%s/%s' % (bucket, filepath),
                           mime_type='text/plain') as infile:

            for line in infile:

                # Build log element from message, and labels from metadata
                log_element = dict(logline_metadata)
                log_element['msg'] = line

                # Try to parse out the severity from the start of the line
                # And try and make sure it maps to a valid SD severity
                match = severity_pattern.match(line)
                if match:
                    log_severity = match.group(1).upper()
                    log_severity = severity_remappings.get(
                        log_severity, log_severity)

                    try:
                        # Write the struct to SD using the hopefully valid severity
                        self.logger.log_struct(log_element,
                                               severity=log_severity)
                    except:
                        # Write the struct to SD without a severity
                        self.logger.log_struct(log_element)

                else:
                    # Write the struct to SD without a severity
                    self.logger.log_struct(log_element)

        return []
Example 16
    def tearDownClass(cls):
        GCSFileSystem(pipeline_options=PipelineOptions()) \
            .delete([cls.staging_bucket_name])
Example 17
    def tearDown(self):
        GCSFileSystem(pipeline_options=PipelineOptions()) \
            .delete([self.staging_bucket_name])