def test_dicom_store_instance_from_gcs(self):
    # Store DICOM files to an empty DICOM store from a GCS bucket,
    # then check that the store metadata matches.
    input_dict_store = {}
    input_dict_store['project_id'] = self.project
    input_dict_store['region'] = REGION
    input_dict_store['dataset_id'] = DATA_SET_ID
    input_dict_store['dicom_store_id'] = self.temp_dicom_store

    expected_output = [True] * NUM_INSTANCE

    with self.test_pipeline as p:
        gcs_path = DICOM_FILES_PATH + "/io_test_files/*"
        results = (
            p
            | fileio.MatchFiles(gcs_path)
            | fileio.ReadMatches()
            | UploadToDicomStore(input_dict_store, 'fileio')
            | beam.Map(lambda x: x['success']))
        assert_that(
            results, equal_to(expected_output), label='store first assert')

    # Check the metadata using the client
    result, status_code = DicomApiHttpClient().qido_search(
        self.project, REGION, DATA_SET_ID, self.temp_dicom_store, 'instances')

    self.assertEqual(status_code, 200)

    # List comparison that works across Python versions
    self.assertCountEqual(result, self.expected_output_all_metadata)

def expand(self, root):
    # TODO(robertwb): Handle streaming (with explicit schema).
    paths_pcoll = root | beam.Create([self.path])
    first = io.filesystems.FileSystems.match(
        [self.path], limits=[1])[0].metadata_list[0].path
    with io.filesystems.FileSystems.open(first) as handle:
        if not self.binary:
            handle = TextIOWrapper(handle)
        if self.incremental:
            sample = next(
                self.reader(
                    handle, *self.args, **dict(self.kwargs, chunksize=100)))
        else:
            sample = self.reader(handle, *self.args, **self.kwargs)

    pcoll = (
        paths_pcoll
        | fileio.MatchFiles(self.path)
        | beam.Reshuffle()
        | fileio.ReadMatches()
        | beam.ParDo(
            _ReadFromPandasDoFn(
                self.reader,
                self.args,
                self.kwargs,
                self.binary,
                self.incremental,
                self.splitter)))
    from apache_beam.dataframe import convert
    return convert.to_dataframe(
        pcoll, proxy=_prefix_range_index_with(':', sample[:0]))

def test_dicom_store_instance(self):
    # Store DICOM files to an empty DICOM store from a GCS bucket,
    # then check that the store metadata matches.
    input_dict = {}
    input_dict['project_id'] = self.project
    input_dict['region'] = REGION
    input_dict['dataset_id'] = DATA_SET_ID
    input_dict['dicom_store_id'] = self.temp_dicom_store
    input_dict['search_type'] = "instances"

    expected_dict = {}
    expected_dict['result'] = self.expected_output_metadata
    expected_dict['status'] = 200
    expected_dict['input'] = input_dict
    expected_dict['success'] = True

    with TestPipeline() as p:
        gcs_path = DICOM_FILES_PATH + "/*"
        results = (
            p
            | fileio.MatchFiles(gcs_path)
            | fileio.ReadMatches()
            | UploadToDicomStore(input_dict, 'fileio')
            | beam.Map(lambda x: x['success']))
        assert_that(results, equal_to([True] * NUM_INSTANCE))

    with TestPipeline() as p:
        results = (p | beam.Create([input_dict]) | DicomSearch())
        assert_that(results, equal_to([expected_dict]))

def test_write_to_dynamic_destination(self):
    sink_params = [
        fileio.TextSink,  # pass a type signature
        fileio.TextSink()  # pass a FileSink object
    ]

    for sink in sink_params:
        dir = self._new_tempdir()

        with TestPipeline() as p:
            _ = (
                p
                | "Create" >> beam.Create(range(100))
                | beam.Map(lambda x: str(x))
                | fileio.WriteToFiles(
                    path=dir,
                    destination=lambda n: "odd" if int(n) % 2 else "even",
                    sink=sink,
                    file_naming=fileio.destination_prefix_naming("test")))

        with TestPipeline() as p:
            result = (
                p
                | fileio.MatchFiles(FileSystems.join(dir, '*'))
                | fileio.ReadMatches()
                | beam.Map(
                    lambda f: (
                        os.path.basename(f.metadata.path).split('-')[0],
                        sorted(map(int, f.read_utf8().strip().split('\n'))))))

            assert_that(
                result,
                equal_to([('odd', list(range(1, 100, 2))),
                          ('even', list(range(0, 100, 2)))]))

def ReadImagesFromDisk(pipeline: beam.Pipeline,
                       base_path: Text) -> beam.pvalue.PCollection:
    """
    The Beam PTransform used to load a collection of images and metadata
    from a local file system or a remote cloud storage bucket.

    Args:
        pipeline (beam.Pipeline): Input beam.Pipeline object coming
            from a TFX Executor.
        base_path (Text): Base directory containing images and labels.
    """
    wildcard_qualifier = "*"

    # ingest all the files from the base path by supplying the wildcard
    file_pattern = os.path.join(base_path, wildcard_qualifier)

    allowed_ext = [".jpg", ".json", ".png", ".txt", ".jpeg"]

    images, label_file = (
        pipeline
        | fileio.MatchFiles(file_pattern)
        | fileio.ReadMatches()
        | beam.Map(read_file_content)
        | "FilterOutFiles" >> beam.Filter(lambda x: x[FILE_EXT] in allowed_ext)
        | "SplitLabelFile" >> beam.Partition(SplitByFileName, 2))

    # label_file is actually a dict
    label_dict = beam.pvalue.AsSingleton(label_file)

    ready_images = (
        images
        | "AddLabelAndMetadata" >> beam.Map(add_label_and_metadata, label_dict))

    return ready_images

def expand(self, root):
    paths_pcoll = root | beam.Create([self.path])
    match = io.filesystems.FileSystems.match([self.path], limits=[1])[0]
    if not match.metadata_list:
        # TODO(BEAM-12031): This should be allowed for streaming pipelines if
        # user provides an explicit schema.
        raise FileNotFoundError(f"Found no files that match {self.path!r}")
    first_path = match.metadata_list[0].path
    with io.filesystems.FileSystems.open(first_path) as handle:
        if not self.binary:
            handle = TextIOWrapper(handle)
        if self.incremental:
            sample = next(
                self.reader(
                    handle, *self.args, **dict(self.kwargs, chunksize=100)))
        else:
            sample = self.reader(handle, *self.args, **self.kwargs)

    pcoll = (
        paths_pcoll
        | fileio.MatchFiles(self.path)
        | beam.Reshuffle()
        | fileio.ReadMatches()
        | beam.ParDo(
            _ReadFromPandasDoFn(
                self.reader,
                self.args,
                self.kwargs,
                self.binary,
                self.incremental,
                self.splitter)))
    from apache_beam.dataframe import convert
    return convert.to_dataframe(
        pcoll, proxy=_prefix_range_index_with(':', sample[:0]))

def test_write_to_different_file_types_some_spilling(self):
    dir = self._new_tempdir()

    with TestPipeline() as p:
        _ = (
            p
            | beam.Create(WriteFilesTest.SIMPLE_COLLECTION)
            | beam.io.fileio.WriteToFiles(
                path=dir,
                destination=lambda record: record['foundation'],
                sink=lambda dest: (
                    WriteFilesTest.CsvSink(WriteFilesTest.CSV_HEADERS)
                    if dest == 'apache' else WriteFilesTest.JsonSink()),
                file_naming=fileio.destination_prefix_naming(),
                max_writers_per_bundle=1))

    with TestPipeline() as p:
        cncf_res = (
            p
            | fileio.MatchFiles(FileSystems.join(dir, 'cncf*'))
            | fileio.ReadMatches()
            | beam.FlatMap(lambda f: f.read_utf8().strip().split('\n'))
            | beam.Map(json.loads))

        apache_res = (
            p
            | "MatchApache" >> fileio.MatchFiles(FileSystems.join(dir, 'apache*'))
            | "ReadApache" >> fileio.ReadMatches()
            | "MapApache" >>
            beam.FlatMap(lambda rf: csv.reader(_get_file_reader(rf))))

        assert_that(
            cncf_res,
            equal_to([
                row for row in self.SIMPLE_COLLECTION
                if row['foundation'] == 'cncf'
            ]),
            label='verifyCNCF')

        assert_that(
            apache_res,
            equal_to([[row['project'], row['foundation']]
                      for row in self.SIMPLE_COLLECTION
                      if row['foundation'] == 'apache']),
            label='verifyApache')

def test_basic_two_files(self):
    files = []
    tempdir = '%s%s' % (self._new_tempdir(), os.sep)

    # Create a couple of files to be matched
    files.append(self._create_temp_file(dir=tempdir))
    files.append(self._create_temp_file(dir=tempdir))

    with TestPipeline() as p:
        files_pc = (
            p
            | fileio.MatchFiles(tempdir)
            | beam.Map(lambda x: x.path))

        assert_that(files_pc, equal_to(files))

def run():
    p = beam.Pipeline(options=PipelineOptions())
    # gcs = GCSFileSystem(PipelineOptions())
    # 'gs://dataflow-buffer/parent-unpack/2018/i20180130/PxpFJwJabD-untarI20180130/DESIGN/USD0808610-20180130.ZIP'
    # input_pattern = ['gs://dataflow-buffer/parent-unpack/2018/i20180130/PxpFJwJabD-untar*/**/*.ZIP']
    input_pattern_1 = 'gs://dataflow-buffer/parent-unpack/2018/i20180130/PxpFJwJabD-untar*/**/*.ZIP'
    # result = [m.metadata_list for m in gcs.match(input_pattern)]

    (p
     | 'Match Files' >> fileio.MatchFiles(input_pattern_1)
     | 'Print read file' >> beam.ParDo(ImageExtract()))

    p.run().wait_until_finish()

def main(runner):
    project_name = 'get-signal-types'
    timestamp = datetime.now(pytz.timezone('US/Pacific')).__str__() \
        .replace(":", "") \
        .replace(" ", "-") \
        .replace(".", "")

    options = PipelineOptions()
    google_cloud_options = options.view_as(GoogleCloudOptions)
    google_cloud_options.project = 'vilago-demo'
    google_cloud_options.job_name = '%s-%s' % (project_name, timestamp)

    if runner == "dataflow":
        google_cloud_options.staging_location = 'gs://vilago-dataflow-output/staging/'
        google_cloud_options.temp_location = 'gs://vilago-dataflow-output/temp/'
        options.view_as(StandardOptions).runner = 'DataflowRunner'
        outputdir = 'gs://vilago-dataflow-output/output/%s' % timestamp
        inputfiles = 'gs://tuh-eeg-corpus/seizure-v1.5.0/edf/train/*/*/*/*/*.edf'
    elif runner == "dataflow-test":
        # same as dataflow except on fewer files
        google_cloud_options.staging_location = 'gs://vilago-dataflow-output/staging/'
        google_cloud_options.temp_location = 'gs://vilago-dataflow-output/temp/'
        options.view_as(StandardOptions).runner = 'DataflowRunner'
        outputdir = 'gs://vilago-dataflow-output/output/%s' % timestamp
        inputfiles = 'gs://tuh-eeg-corpus/seizure-v1.5.0/edf/train/01_tcp_ar/002/*/*/*.edf'
    else:
        google_cloud_options.staging_location = 'staging/'
        google_cloud_options.temp_location = 'temp'
        options.view_as(StandardOptions).runner = 'DirectRunner'
        outputdir = 'directrunner-output/outputs'
        inputfiles = 'gs://tuh-eeg-corpus/seizure-v1.5.0/edf/train/01_tcp_ar/002/00000254/*/*.edf'

    print('Starting beam job with runner %s...' %
          options.view_as(StandardOptions).runner)

    p = beam.Pipeline(options=options)

    files = p | 'Match EDF files' >> fileio.MatchFiles(inputfiles)
    files = prevent_fusion(
        files | 'Extract EDF path' >> beam.Map(lambda x: x.path))

    files \
        | 'Parse EDF' >> beam.ParDo(GetSignalTypes()) \
        | 'Pair with 1' >> beam.ParDo(PairWithOne()) \
        | 'Group' >> beam.GroupByKey() \
        | 'Sum' >> beam.Map(lambda t: (t[0], sum(t[1]))) \
        | 'Format' >> beam.Map(lambda t: '%s, %d' % t) \
        | 'Output to folder' >> beam.io.WriteToText(outputdir, file_name_suffix='.txt')

    p.run()
    print('Beam job finished')

def file_process_pattern_access_metadata():

    import apache_beam as beam
    from apache_beam.io import fileio

    # [START FileProcessPatternAccessMetadataSnip1]
    with beam.Pipeline() as p:
        readable_files = (
            p
            | fileio.MatchFiles('hdfs://path/to/*.txt')
            | fileio.ReadMatches()
            | beam.Reshuffle())
        files_and_contents = (
            readable_files
            | beam.Map(lambda x: (x.metadata.path, x.read_utf8())))

def expand(self, root):
    # TODO(robertwb): Handle streaming (with explicit schema).
    paths_pcoll = root | beam.Create([self.path])
    first = io.filesystems.FileSystems.match(
        [self.path], limits=[1])[0].metadata_list[0].path
    with io.filesystems.FileSystems.open(first) as handle:
        df = next(self.reader(handle, *self.args, chunksize=100, **self.kwargs))

    pcoll = (
        paths_pcoll
        | fileio.MatchFiles(self.path)
        | fileio.ReadMatches()
        | beam.ParDo(
            _ReadFromPandasDoFn(self.reader, self.args, self.kwargs)))
    from apache_beam.dataframe import convert
    return convert.to_dataframe(
        pcoll, proxy=_prefix_range_index_with(':', df[:0]))

def test_write_to_single_file_batch(self):
    dir = self._new_tempdir()

    with TestPipeline() as p:
        _ = (
            p
            | beam.Create(WriteFilesTest.SIMPLE_COLLECTION)
            | "Serialize" >> beam.Map(json.dumps)
            | beam.io.fileio.WriteToFiles(path=dir))

    with TestPipeline() as p:
        result = (
            p
            | fileio.MatchFiles(FileSystems.join(dir, '*'))
            | fileio.ReadMatches()
            | beam.FlatMap(lambda f: f.read_utf8().strip().split('\n'))
            | beam.Map(json.loads))

        assert_that(result, equal_to([row for row in self.SIMPLE_COLLECTION]))

def test_streaming_different_file_types(self):
    dir = self._new_tempdir()
    input = iter(WriteFilesTest.SIMPLE_COLLECTION)
    ts = (
        TestStream().advance_watermark_to(0).add_elements(
            [next(input), next(input)]).advance_watermark_to(10).add_elements(
                [next(input), next(input)]).advance_watermark_to(20).add_elements(
                    [next(input), next(input)]).advance_watermark_to(30).add_elements(
                        [next(input), next(input)]).advance_watermark_to(40).advance_watermark_to_infinity())

    def no_colon_file_naming(*args):
        file_name = fileio.destination_prefix_naming()(*args)
        return file_name.replace(':', '_')

    with TestPipeline() as p:
        _ = (
            p
            | ts
            | beam.WindowInto(FixedWindows(10))
            | beam.io.fileio.WriteToFiles(
                path=dir,
                destination=lambda record: record['foundation'],
                sink=lambda dest: (
                    WriteFilesTest.CsvSink(WriteFilesTest.CSV_HEADERS)
                    if dest == 'apache' else WriteFilesTest.JsonSink()),
                file_naming=no_colon_file_naming,
                max_writers_per_bundle=0,
            ))

    with TestPipeline() as p:
        cncf_files = (
            p
            | fileio.MatchFiles(FileSystems.join(dir, 'cncf*'))
            | "CncfFileNames" >> beam.Map(lambda fm: fm.path))
        apache_files = (
            p
            | "MatchApache" >> fileio.MatchFiles(FileSystems.join(dir, 'apache*'))
            | "ApacheFileNames" >> beam.Map(lambda fm: fm.path))

        assert_that(
            cncf_files,
            matches_all([
                stringmatches.matches_regexp(
                    '.*cncf-1970-01-01T00_00_00-1970-01-01T00_00_10--.*'),
                stringmatches.matches_regexp(
                    '.*cncf-1970-01-01T00_00_10-1970-01-01T00_00_20--.*'),
                stringmatches.matches_regexp(
                    '.*cncf-1970-01-01T00_00_20-1970-01-01T00_00_30--.*'),
                stringmatches.matches_regexp(
                    '.*cncf-1970-01-01T00_00_30-1970-01-01T00_00_40--.*')
            ]),
            label='verifyCNCFFiles')

        assert_that(
            apache_files,
            matches_all([
                stringmatches.matches_regexp(
                    '.*apache-1970-01-01T00_00_00-1970-01-01T00_00_10--.*'),
                stringmatches.matches_regexp(
                    '.*apache-1970-01-01T00_00_10-1970-01-01T00_00_20--.*'),
                stringmatches.matches_regexp(
                    '.*apache-1970-01-01T00_00_20-1970-01-01T00_00_30--.*'),
                stringmatches.matches_regexp(
                    '.*apache-1970-01-01T00_00_30-1970-01-01T00_00_40--.*')
            ]),
            label='verifyApacheFiles')

def run(argv=None):
    class DataIngestion:
        def parse_method(self, string_input):
            values = re.split(
                ",", re.sub('\r\n', '', re.sub(u'"', '', string_input)))
            row = dict(
                zip(('Requisition_Number', 'Opportunity_Title',
                     'Opportunity_Status', 'Featured', 'Company_Code', 'Company',
                     'Entity', 'Entity_Desc', 'Source_Job_Code', 'Job_Title',
                     'FullTime_Or_PartTime', 'Salary_Or_Hourly', 'Recruiter',
                     'Location_Name', 'Date_Applied', 'Source', 'Step',
                     'Step_Date', 'Recruiting_Hire_Date', 'Start_Date',
                     'Candidate', 'Candidate_Email_Address',
                     'Candidate_Primary_Phone', 'First_Published_Date',
                     'Average_Days_Between_Publish_Hire_Dates'), values))
            return row

    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--input',
        dest='input',
        required=False,
        help='Input file to read. This can be a local file or '
        'a file in a Google Storage Bucket.',
        # default='gs://hc_crackerbarrel_ats/Historical'
    )
    parser.add_argument('--output',
                        dest='output',
                        required=False,
                        help='Output BQ table to write results to.',
                        default='chs.ats_master')

    # Parse arguments from the command line.
    known_args, pipeline_args = parser.parse_known_args(argv)

    pipeline_args.extend([
        '--runner=DataflowRunner',
        '--project=hireclix',
        '--region=us-east1',
        '--staging_location=gs://hc_chs_ats/File_Temp/Source',
        '--temp_location=gs://hc_chs_ats/File_Temp/Staging',
        '--job_name=chstest1'
    ])

    data_ingestion = DataIngestion()

    with beam.Pipeline(options=PipelineOptions(pipeline_args)) as p:
        readable_files = (
            p
            | 'Matching .csv files' >> fileio.MatchFiles(
                'gs://hc_chs_ats/File_Temp/Temp_File/*.csv')
            | 'Read Matches' >> fileio.ReadMatches()
            | 'Rebalance data inputs' >> beam.Reshuffle())

        files_and_content = (
            readable_files
            | 'Determine FilePath' >> beam.Map(lambda x: x.metadata.path))

        writebq = (
            files_and_content
            | 'Read from a File' >> beam.io.ReadAllFromText(skip_header_lines=1)
            | 'String To BigQuery Row' >> beam.Map(
                lambda s: data_ingestion.parse_method(s))
            | 'Write to BigQuery' >> beam.io.Write(
                beam.io.WriteToBigQuery(
                    known_args.output,
                    schema='Requisition_Number:STRING,'
                           'Opportunity_Title:STRING,'
                           'Opportunity_Status:STRING,'
                           'Featured:STRING,'
                           'Company_Code:STRING,'
                           'Company:STRING,'
                           'Entity:STRING,'
                           'Entity_Desc:STRING,'
                           'Source_Job_Code:STRING,'
                           'Job_Title:STRING,'
                           'FullTime_Or_PartTime:STRING,'
                           'Salary_Or_Hourly:STRING,'
                           'Recruiter:STRING,'
                           'Location_Name:STRING,'
                           'Date_Applied:DATE,'
                           'Source:STRING,'
                           'Step:STRING,'
                           'Step_Date:DATE,'
                           'Recruiting_Hire_Date:DATE,'
                           'Start_Date:DATE,'
                           'Candidate:STRING,'
                           'Candidate_Email_Address:STRING,'
                           'Candidate_Primary_Phone:STRING,'
                           'First_Published_Date:DATE,'
                           'Average_Days_Between_Publish_Hire_Dates:STRING',
                    create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
                    write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE)))

def run(argv=None): """Main entry point; defines and runs the wordcount pipeline.""" parser = argparse.ArgumentParser() parser.add_argument('--input', dest='input', default='gs://dataflow-samples/shakespeare/kinglear.txt', help='Input file to process.') parser.add_argument('--output', dest='output', required=True, help='Output file to write results to.') known_args, pipeline_args = parser.parse_known_args(argv) # We use the save_main_session option because one or more DoFn's in this # workflow rely on global context (e.g., a module imported at module level). pipeline_options = PipelineOptions(pipeline_args) pipeline_options.view_as(SetupOptions).save_main_session = True p = beam.Pipeline(options=pipeline_options) from apache_beam.io import fileio with beam.Pipeline(options=PipelineOptions()) as p: output = (p | "match" >> fileio.MatchFiles(known_args.input) | "read match" >> fileio.ReadMatches() | "read_file" >> beam.Map(lambda x: (x.metadata.path, x.read_utf8())) | "parse file" >> beam.Map(lambda x: (parse_filename(x[0]), parse_file(x[1].split('\n')) )) | "unfold" >> beam.ParDo(MetaAndContent()) # | "debug" >> beam.FlatMap(lambda x: print(x)) ) table_spec = 'brainscode-140622:tf2up.conversions' table_schema = {'fields': [ {'name': 'date', 'type': 'DATE'}, {'name': 'file_hash', 'type': 'STRING'}, {'name': 'line', 'type': 'INT64'}, {'name': 'position', 'type': 'INT64'}, {'name': 'severity', 'type': 'STRING'}, {'name': 'message', 'type': 'STRING'}, {'name': 'ops', 'type': 'STRING', 'mode': 'REPEATED'} ]} # two different setups for create_disposition CREATE_IF_NEEDED # and write_disposition - WRITE_TRUNCATE output | 'store to BQ' >> beam.io.WriteToBigQuery( table_spec, schema=table_schema, write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND, create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER, method="STREAMING_INSERTS" ) # Write the output using a "Write" transform that has side effects. # pylint: disable=expression-not-assigned output | 'write' >> WriteToText(known_args.output)