Example #1
 def test_read_from_text_with_file_name_file_pattern(self):
   pattern, expected_data = write_pattern(
       lines_per_file=[5, 5], return_filenames=True)
   assert len(expected_data) == 10
   with TestPipeline() as pipeline:
     pcoll = pipeline | 'Read' >> ReadFromTextWithFilename(pattern)
     assert_that(pcoll, equal_to(expected_data))
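The write_pattern helper is not shown here. A minimal sketch of what it might do, assuming it writes one temporary file per entry in lines_per_file into a single directory and, with return_filenames=True, returns the matching glob together with the expected (filename, line) pairs:

import os
import tempfile

def write_pattern(lines_per_file, return_filenames=False):
    # Hypothetical helper: creates one temp file per entry in lines_per_file.
    temp_dir = tempfile.mkdtemp()
    expected_data = []
    for i, num_lines in enumerate(lines_per_file):
        file_name = os.path.join(temp_dir, 'input_%d.txt' % i)
        with open(file_name, 'w') as f:
            for j in range(num_lines):
                line = 'line %d of file %d' % (j, i)
                f.write(line + '\n')
                # ReadFromTextWithFilename emits (filename, line) tuples, so the
                # expected data mirrors that shape when return_filenames is set.
                expected_data.append((file_name, line) if return_filenames else line)
    return os.path.join(temp_dir, 'input_*.txt'), expected_data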
Example #2
 def test_read_from_text_with_file_name_single_file(self):
   file_name, data = write_data(5)
   expected_data = [(file_name, el) for el in data]
   assert len(expected_data) == 5
   with TestPipeline() as pipeline:
     pcoll = pipeline | 'Read' >> ReadFromTextWithFilename(file_name)
     assert_that(pcoll, equal_to(expected_data))
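write_data is also not shown. A rough sketch, assuming it writes a single temporary file (with an optional file-name prefix, as used in the next example) and returns the file name plus the lines written:

import os
import tempfile

def write_data(num_lines, prefix=None):
    # Hypothetical helper: writes num_lines lines to one temp file in the
    # default temp directory and returns (file_name, lines).
    fd, file_name = tempfile.mkstemp(prefix=prefix or 'input', suffix='.txt')
    lines = ['line %d' % i for i in range(num_lines)]
    with os.fdopen(fd, 'w') as f:
        f.write('\n'.join(lines))
    return file_name, lines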
Example #3
 def test_read_from_text_with_file_name_file_pattern(self):
   prefix = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
   file_name_1, data_1 = write_data(5, prefix=prefix)
   file_name_2, data_2 = write_data(5, prefix=prefix)
   expected_data = []
   expected_data.extend([(file_name_1, el) for el in data_1])
   expected_data.extend([(file_name_2, el) for el in data_2])
   folder = file_name_1[:file_name_1.rfind(os.path.sep)]
   pattern = folder + os.path.sep + prefix + '*'
   assert len(expected_data) == 10
   pipeline = TestPipeline()
   pcoll = pipeline | 'Read' >> ReadFromTextWithFilename(pattern)
   assert_that(pcoll, equal_to(expected_data))
   pipeline.run()
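This variant builds the TestPipeline explicitly and calls run() at the end; the context-manager form used in the earlier examples does the same job, running the pipeline when the with block exits:

with TestPipeline() as pipeline:
    pcoll = pipeline | 'Read' >> ReadFromTextWithFilename(pattern)
    assert_that(pcoll, equal_to(expected_data))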
Example #4
def run(argv=None, save_main_session=True):
    parser = argparse.ArgumentParser()
    parser.add_argument('--input',
                        dest='input',
                        default='gs://blog-data-resources/books_txt_files/**',
                        help='Input file pattern to process.')
    parser.add_argument('--table_spec',
                        dest='table_spec',
                        default='ilan-uzan-297514:tests.author_wordcount',
                        help='Destination BigQuery table.')
    known_args, pipeline_args = parser.parse_known_args(argv)

    pipeline_options = PipelineOptions(pipeline_args)
    with beam.Pipeline(options=pipeline_options) as p:
        table_schema = {
            'fields': [{
                'name': 'author',
                'type': 'STRING',
                'mode': 'NULLABLE'
            }, {
                'name': 'word',
                'type': 'STRING',
                'mode': 'NULLABLE'
            }, {
                'name': 'cnt',
                'type': 'INTEGER',
                'mode': 'NULLABLE'
            }]
        }

        def to_json_row(element):
            key, cnt = element
            author, word = key

            return {"author": author, "word": word, "cnt": cnt}

        (p
         | 'Read files' >> ReadFromTextWithFilename(known_args.input)
         | 'Split lines' >> beam.ParDo(WordExtractingDoFn())
         | 'Pair with 1' >> beam.Map(lambda x: ((x[0], x[1]), 1))
         | 'Sum per author & word' >> beam.CombinePerKey(sum)
         | 'Format records to JSON' >> beam.Map(to_json_row)
         | 'Write to BigQuery' >> beam.io.WriteToBigQuery(
             known_args.table_spec,
             schema=table_schema,
             write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE,
             create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED))
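WordExtractingDoFn is referenced above but not defined (the snippet also assumes the usual imports: argparse, apache_beam as beam, PipelineOptions, ReadFromTextWithFilename). A plausible sketch, assuming each input element is a (filename, line) tuple from ReadFromTextWithFilename and that the author can be derived from the file name, which is an assumption about the input data layout:

import os
import re

import apache_beam as beam

class WordExtractingDoFn(beam.DoFn):
    # Hypothetical DoFn: splits (filename, line) into (author, word) pairs,
    # treating the file's base name (without extension) as the author.
    def process(self, element):
        file_name, line = element
        author = os.path.splitext(os.path.basename(file_name))[0]
        for word in re.findall(r"[\w']+", line):
            yield (author, word)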
Example #5
def BuildPipeline(pathToFiles, compute_table_name):
    raw_output = "output-raw"
    final_output = "output-final"
    options = PipelineOptions()
    #pathToFiles= "C:\\Users\\Abhijeet\\Documents\\GitHub\\dsba6155project\\dsba6155project\\data\\**"
    pipeline = beam.Pipeline(options=options)
    vectors = (
        pipeline
        | "Read Files" >> ReadFromTextWithFilename(pathToFiles)
        | "Group by File" >> beam.GroupByKey()
        | "Hashing Vectors" >> beam.ParDo(Hashing())
        # | "Write CSV to biqquery" >> beam.io.WriteToBigQuery(
        #     table=compute_table_name,
        #     schema=GetSchema()
        # )
        | "Write to file" >> WriteToText(raw_output)
        #|
        #| "Save" >> beam.ParDo(Save())
    )
    return pipeline
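The Hashing DoFn is not defined in this snippet. After the GroupByKey, each element is a (file_name, lines) pair, so one possible sketch hashes each document into a fixed-size feature vector with scikit-learn's HashingVectorizer; the vectorizer choice and output shape are assumptions:

import apache_beam as beam
from sklearn.feature_extraction.text import HashingVectorizer

class Hashing(beam.DoFn):
    # Hypothetical DoFn: maps (file_name, lines) to (file_name, feature_vector).
    def process(self, element):
        file_name, lines = element
        vectorizer = HashingVectorizer(n_features=2 ** 10)
        vector = vectorizer.transform([' '.join(lines)]).toarray()[0]
        yield (file_name, vector.tolist())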
Example #6
def BuildPipeline(pathToFiles, compute_table_name):
    raw_output = "output-raw"
    final_output = "output-final"
    options = PipelineOptions()
    #pathToFiles= "C:\\Users\\Abhijeet\\Documents\\GitHub\\dsba6155project\\dsba6155project\\data\\**"
    pipeline = beam.Pipeline(options=options)
    vectors = (
        pipeline
        | "Read Files" >> ReadFromTextWithFilename(pathToFiles)
        | "Group by File" >> beam.GroupByKey()
        | "Hashing Vectors" >> beam.ParDo(Hashing())
        | "Stack Em UP" >> beam.CombineGlobally(StackingFn())
        | beam.ParDo(RunPCA)  # if RunPCA is a DoFn class, it likely needs instantiating: beam.ParDo(RunPCA())
        | "Write CSV to BigQuery" >> beam.io.WriteToBigQuery(
            table=compute_table_name,
            schema=GetSchema(),
            write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE,
            # create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED
        )
        #| "Write to file" >> WriteToText(raw_output)
        #|
        #| "Save" >> beam.ParDo(Save())
    )
    return pipeline
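StackingFn, RunPCA and GetSchema are likewise left undefined. A sketch of a stacking CombineFn, assuming each input is a (file_name, vector) pair and the goal is to hand one combined list to the PCA step; note that the returned pipeline still has to be executed, e.g. BuildPipeline(path, table).run().wait_until_finish():

import apache_beam as beam

class StackingFn(beam.CombineFn):
    # Hypothetical CombineFn: collects all (file_name, vector) pairs into a
    # single list so a downstream step can treat them as one matrix.
    def create_accumulator(self):
        return []

    def add_input(self, accumulator, element):
        accumulator.append(element)
        return accumulator

    def merge_accumulators(self, accumulators):
        merged = []
        for acc in accumulators:
            merged.extend(acc)
        return merged

    def extract_output(self, accumulator):
        return accumulator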
Example #7
 def process(self, elem):
     # Note: ReadFromTextWithFilename is a PTransform, so returning it from
     # DoFn.process() does not actually read the file (see the sketch below).
     return ReadFromTextWithFilename(elem)
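Because ReadFromTextWithFilename is a PTransform, it cannot be applied from inside DoFn.process like this. A hedged alternative sketch that reads the file named by each element with Beam's FileSystems API and emits (file_name, line) tuples (the UTF-8 encoding is an assumption); a PCollection of file patterns can also be fed to beam.io.ReadAllFromText instead:

import apache_beam as beam
from apache_beam.io.filesystems import FileSystems

class ReadFileWithName(beam.DoFn):
    # Hypothetical DoFn: reads the file named by each element and yields
    # (file_name, line) tuples, mirroring ReadFromTextWithFilename's output.
    def process(self, elem):
        with FileSystems.open(elem) as f:
            for line in f.read().decode('utf-8').splitlines():
                yield (elem, line)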