def test_read_from_text_with_file_name_file_pattern(self):
    pattern, expected_data = write_pattern(
        lines_per_file=[5, 5], return_filenames=True)
    assert len(expected_data) == 10
    with TestPipeline() as pipeline:
        pcoll = pipeline | 'Read' >> ReadFromTextWithFilename(pattern)
        assert_that(pcoll, equal_to(expected_data))
def test_read_from_text_with_file_name_single_file(self):
    file_name, data = write_data(5)
    expected_data = [(file_name, el) for el in data]
    assert len(expected_data) == 5
    with TestPipeline() as pipeline:
        pcoll = pipeline | 'Read' >> ReadFromTextWithFilename(file_name)
        assert_that(pcoll, equal_to(expected_data))
def test_read_from_text_with_file_name_file_pattern(self):
    prefix = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
    file_name_1, data_1 = write_data(5, prefix=prefix)
    file_name_2, data_2 = write_data(5, prefix=prefix)
    expected_data = []
    expected_data.extend([(file_name_1, el) for el in data_1])
    expected_data.extend([(file_name_2, el) for el in data_2])
    folder = file_name_1[:file_name_1.rfind(os.path.sep)]
    pattern = folder + os.path.sep + prefix + '*'
    assert len(expected_data) == 10
    pipeline = TestPipeline()
    pcoll = pipeline | 'Read' >> ReadFromTextWithFilename(pattern)
    assert_that(pcoll, equal_to(expected_data))
    pipeline.run()
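# The three tests above come from Beam's textio test suite; write_data and
# write_pattern are private helpers defined there, not public API. A minimal
# stand-in for write_data (an assumption, not Beam's actual helper) could be:
import tempfile


def write_data(num_lines, prefix=None):
    # Write num_lines numbered lines to a temp file; return (path, lines).
    lines = ['line%d' % i for i in range(num_lines)]
    with tempfile.NamedTemporaryFile(
            mode='w', prefix=prefix or 'tmp', suffix='.txt',
            delete=False) as f:
        f.write('\n'.join(lines))
    return f.name, lines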
def run(argv=None, save_main_session=True):
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--input',
        dest='input',
        default='gs://blog-data-resources/books_txt_files/**',
        help='Input file pattern to process.')
    parser.add_argument(
        '--table_spec',  # fixed: the flag name had a trailing space
        dest='table_spec',
        default='ilan-uzan-297514:tests.author_wordcount',
        help='Destination BigQuery table.')
    known_args, pipeline_args = parser.parse_known_args(argv)
    pipeline_options = PipelineOptions(pipeline_args)

    with beam.Pipeline(options=pipeline_options) as p:
        table_schema = {
            'fields': [
                {'name': 'author', 'type': 'STRING', 'mode': 'NULLABLE'},
                {'name': 'word', 'type': 'STRING', 'mode': 'NULLABLE'},
                {'name': 'cnt', 'type': 'INTEGER', 'mode': 'NULLABLE'},
            ]
        }

        def to_json_row(element):
            key, cnt = element
            author, word = key
            return {"author": author, "word": word, "cnt": cnt}

        (p
         | 'Read files' >> ReadFromTextWithFilename(known_args.input)
         | 'Split lines' >> beam.ParDo(WordExtractingDoFn())
         | 'Pair with 1' >> beam.Map(lambda x: ((x[0], x[1]), 1))
         | 'Sum per author & word' >> beam.CombinePerKey(sum)
         | 'Format records to JSON' >> beam.Map(to_json_row)
         | 'Write to BigQuery' >> beam.io.WriteToBigQuery(
             known_args.table_spec,
             schema=table_schema,
             write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE,
             create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED))
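# WordExtractingDoFn is not shown in the example above. A hypothetical
# stand-in, assuming the author name is encoded in the file name that
# ReadFromTextWithFilename pairs with each line:
import os
import re

import apache_beam as beam


class WordExtractingDoFn(beam.DoFn):
    def process(self, element):
        # element is a (filename, line) tuple from ReadFromTextWithFilename.
        file_name, line = element
        author = os.path.splitext(os.path.basename(file_name))[0]
        for word in re.findall(r"[A-Za-z']+", line):
            yield (author, word.lower())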
def BuildPipeline(pathToFiles, compute_table_name):
    raw_output = "output-raw"
    final_output = "output-final"
    options = PipelineOptions()
    # pathToFiles = "C:\\Users\\Abhijeet\\Documents\\GitHub\\dsba6155project\\dsba6155project\\data\\**"
    pipeline = beam.Pipeline(options=options)
    vectors = (
        pipeline
        | "Read Files" >> ReadFromTextWithFilename(pathToFiles)
        | "Group by File" >> beam.GroupByKey()
        | "Hashing Vectors" >> beam.ParDo(Hashing())
        # | "Write CSV to bigquery" >> beam.io.WriteToBigQuery(
        #     table=compute_table_name,
        #     schema=GetSchema()
        # )
        | "Write to file" >> WriteToText(raw_output)
        # | "Save" >> beam.ParDo(Save())
    )
    return pipeline
def BuildPipeline(pathToFiles, compute_table_name):
    raw_output = "output-raw"
    final_output = "output-final"
    options = PipelineOptions()
    # pathToFiles = "C:\\Users\\Abhijeet\\Documents\\GitHub\\dsba6155project\\dsba6155project\\data\\**"
    pipeline = beam.Pipeline(options=options)
    vectors = (
        pipeline
        | "Read Files" >> ReadFromTextWithFilename(pathToFiles)
        | "Group by File" >> beam.GroupByKey()
        | "Hashing Vectors" >> beam.ParDo(Hashing())
        | "Stack Em UP" >> beam.CombineGlobally(StackingFn())
        | beam.ParDo(RunPCA)
        | "Write CSV to bigquery" >> beam.io.WriteToBigQuery(
            table=compute_table_name,
            schema=GetSchema(),
            write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE,
            # create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED
        )
        # | "Write to file" >> WriteToText(raw_output)
        # | "Save" >> beam.ParDo(Save())
    )
    return pipeline
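# Hashing, StackingFn, RunPCA, and GetSchema are project-specific and not
# shown. A hypothetical sketch of the Hashing DoFn, assuming scikit-learn's
# HashingVectorizer: GroupByKey hands it (filename, iterable-of-lines) pairs,
# and it emits one fixed-width hashed term-frequency vector per file.
import apache_beam as beam
from sklearn.feature_extraction.text import HashingVectorizer


class Hashing(beam.DoFn):
    def process(self, element):
        file_name, lines = element
        vectorizer = HashingVectorizer(n_features=2 ** 10)
        vec = vectorizer.transform([' '.join(lines)])
        yield (file_name, vec.toarray()[0].tolist())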
def process(self, elem):
    # ReadFromTextWithFilename is a PTransform; a DoFn cannot return one.
    # Read the file directly and emit (filename, line) pairs instead.
    with FileSystems.open(elem) as f:
        for line in f:
            yield (elem, line.decode('utf-8').rstrip('\n'))
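# FileSystems above comes from apache_beam.io.filesystems. If the goal is to
# read files whose names arrive as pipeline elements, Beam ships a transform
# for exactly that: ReadAllFromText. A minimal sketch with hypothetical paths;
# note the with_filename flag is only available in newer Beam releases.
import apache_beam as beam
from apache_beam.io.textio import ReadAllFromText

with beam.Pipeline() as p:
    lines = (
        p
        | beam.Create(['gs://my-bucket/a.txt', 'gs://my-bucket/b.txt'])
        | ReadAllFromText(with_filename=True))  # emits (filename, line) pairs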