Example #1
 def test_read_all_single_file(self):
   file_name, expected_data = write_data(5)
   assert len(expected_data) == 5
   with TestPipeline() as pipeline:
     pcoll = pipeline | 'Create' >> Create(
          [file_name]) | 'ReadAll' >> ReadAllFromText()
     assert_that(pcoll, equal_to(expected_data))
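These unit tests appear to come from Beam's textio test suite and call a write_data helper that is not reproduced on this page. Judging only from how it is used (write_data(5) returns a file path plus the 5 lines expected back), a minimal hypothetical stand-in could look like the sketch below; the temp-file handling and the 'line%d' contents are assumptions, not the real helper.

 import tempfile

 def write_data(num_lines):
   # Hypothetical stand-in for the test helper: write num_lines numbered
   # lines to a temporary file and return (file_name, expected_lines).
   lines = ['line%d' % i for i in range(num_lines)]
   fd, file_name = tempfile.mkstemp(suffix='.txt')
   with open(fd, 'w') as f:
     f.write('\n'.join(lines))
   return file_name, lines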
Example #2
def run(argv=None):
    # argument parser
    parser = argparse.ArgumentParser()

    # pipeline options, google_cloud_options
    known_args, pipeline_args = parser.parse_known_args(argv)
    pipeline_options = PipelineOptions(pipeline_args)
    setup_options = pipeline_options.view_as(SetupOptions)
    setup_options.save_main_session = True

    p = beam.Pipeline(options=pipeline_options)

    p1 = (p
          | 'trigger from pubsub' >> beam.io.ReadFromPubSub(topic='projects/PROJECT_ID/topics/TOPIC_NAME_1')
          | 'convert msg to dict' >> beam.Map(lambda x: json.loads(x))
          | 'extract filename' >> beam.Map(lambda x: 'gs://{}/{}'.format(x['bucket'], x['name']))
          | 'read file' >> ReadAllFromText()
          | 'split' >> beam.Map(lambda x: x.split(','))
          | 'format to dict' >> beam.Map(lambda x: {"id": x[0], "name": x[1]}))

    # Write the rows to BigQuery
    output_rec = p1 | 'write to BigQuery' >> WriteToBigQuery(
        'PROJECT_ID:DATASET_ID.TABLE_NAME',
        schema='id:INTEGER, name:STRING',
        create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
        write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND)

    result = p.run()
    result.wait_until_finish()
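Example #2 assumes the Pub/Sub messages are JSON notifications carrying at least a bucket and a name field (for instance Cloud Storage object notifications), that the referenced files are comma-separated id,name lines, and that the usual imports (argparse, json, apache_beam as beam, PipelineOptions, SetupOptions, ReadAllFromText, WriteToBigQuery) are in scope. A small local sketch of what the two Map steps do to such a message; the bucket, object name and CSV line below are made up:

import json

# Hypothetical notification payload as published to TOPIC_NAME_1.
message = b'{"bucket": "my-bucket", "name": "incoming/users.csv"}'

record = json.loads(message)
gcs_path = 'gs://{}/{}'.format(record['bucket'], record['name'])
print(gcs_path)  # gs://my-bucket/incoming/users.csv

# Each line read from that file, e.g. '42,Alice', becomes a BigQuery row dict.
fields = '42,Alice'.split(',')
row = {"id": fields[0], "name": fields[1]}
print(row)  # {'id': '42', 'name': 'Alice'}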
Example #3
 def test_read_all_file_pattern(self):
   pattern, expected_data = write_pattern([5, 3, 12, 8, 8, 4])
   assert len(expected_data) == 40
   with TestPipeline() as pipeline:
     pcoll = (pipeline
              | 'Create' >> Create([pattern])
               | 'ReadAll' >> ReadAllFromText())
     assert_that(pcoll, equal_to(expected_data))
Example #4
    def test_read_all_with_filename(self):
        pattern, expected_data = write_pattern([5, 3], return_filenames=True)
        assert len(expected_data) == 8

        with TestPipeline() as pipeline:
            pcoll = (pipeline
                     | 'Create' >> Create([pattern])
                     | 'ReadAll' >> ReadAllFromText(with_filename=True))
            assert_that(pcoll, equal_to(expected_data))
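Examples #3 and #4 rely on a write_pattern helper, again not shown on this page. From its usage it writes one temp file per entry in the list (with that many lines each, all in one directory) and returns a matching glob pattern together with the expected lines, as (file_name, line) tuples when return_filenames=True. A hypothetical sketch with made-up file names and line contents:

 import os
 import tempfile

 def write_pattern(lines_per_file, return_filenames=False):
   # Hypothetical stand-in for the test helper.
   tmp_dir = tempfile.mkdtemp()
   expected = []
   for i, num_lines in enumerate(lines_per_file):
     lines = ['file%d_line%d' % (i, j) for j in range(num_lines)]
     file_name = os.path.join(tmp_dir, 'input%d.txt' % i)
     with open(file_name, 'w') as f:
       f.write('\n'.join(lines))
     if return_filenames:
       expected.extend((file_name, line) for line in lines)
     else:
       expected.extend(lines)
   return os.path.join(tmp_dir, 'input*.txt'), expected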
Example #5
 def test_read_all_gzip(self):
   _, lines = write_data(100)
   with TempDir() as tempdir:
     file_name = tempdir.create_temp_file()
     with gzip.GzipFile(file_name, 'wb') as f:
       f.write('\n'.join(lines).encode('utf-8'))
     with TestPipeline() as pipeline:
       pcoll = (pipeline
                | Create([file_name])
                | 'ReadAll' >> ReadAllFromText(
                    compression_type=CompressionTypes.GZIP))
       assert_that(pcoll, equal_to(lines))
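ReadAllFromText's compression_type defaults to CompressionTypes.AUTO, which infers the codec from the file extension; the explicit GZIP above is presumably needed because the temp file has no .gz suffix. A sketch of the same read relying on AUTO, reusing the test helpers already imported above; the directory, file name and line contents are assumptions:

 import gzip
 import os
 import tempfile

 # Write a gzip file whose name ends in '.gz' so AUTO can infer the codec.
 gz_name = os.path.join(tempfile.mkdtemp(), 'data.txt.gz')
 with gzip.GzipFile(gz_name, 'wb') as f:
   f.write(b'first line\nsecond line')

 with TestPipeline() as pipeline:
   pcoll = (pipeline
            | Create([gz_name])
            | 'ReadAll' >> ReadAllFromText())  # compression_type left at AUTO
   assert_that(pcoll, equal_to(['first line', 'second line']))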
Example #6
 def test_read_all_gzip(self):
     _, lines = write_data(100)
     file_name = self._create_temp_file()
     with gzip.GzipFile(file_name, 'wb') as f:
          f.write('\n'.join(lines).encode('utf-8'))
     pipeline = TestPipeline()
     pcoll = (pipeline
              | Create([file_name])
              | 'ReadAll' >>
              ReadAllFromText(compression_type=CompressionTypes.GZIP))
     assert_that(pcoll, equal_to(lines))
     pipeline.run()
Example #7
 def test_read_all_many_file_patterns(self):
   pattern1, expected_data1 = write_pattern([5, 3, 12, 8, 8, 4])
   assert len(expected_data1) == 40
   pattern2, expected_data2 = write_pattern([3, 7, 9])
   assert len(expected_data2) == 19
   pattern3, expected_data3 = write_pattern([11, 20, 5, 5])
   assert len(expected_data3) == 41
   expected_data = []
   expected_data.extend(expected_data1)
   expected_data.extend(expected_data2)
   expected_data.extend(expected_data3)
   with TestPipeline() as pipeline:
     pcoll = pipeline | 'Create' >> Create(
          [pattern1, pattern2, pattern3]) | 'ReadAll' >> ReadAllFromText()
     assert_that(pcoll, equal_to(expected_data))
Example #8
 def test_read_all_many_single_files(self):
   file_name1, expected_data1 = write_data(5)
   assert len(expected_data1) == 5
   file_name2, expected_data2 = write_data(10)
   assert len(expected_data2) == 10
   file_name3, expected_data3 = write_data(15)
   assert len(expected_data3) == 15
   expected_data = []
   expected_data.extend(expected_data1)
   expected_data.extend(expected_data2)
   expected_data.extend(expected_data3)
   with TestPipeline() as pipeline:
     pcoll = pipeline | 'Create' >> Create(
          [file_name1, file_name2, file_name3]) | 'ReadAll' >> ReadAllFromText()
     assert_that(pcoll, equal_to(expected_data))
Example #9
 def test_read_all_unavailable_files_ignored(self):
   file_name1, expected_data1 = write_data(5)
   assert len(expected_data1) == 5
   file_name2, expected_data2 = write_data(10)
   assert len(expected_data2) == 10
   file_name3, expected_data3 = write_data(15)
   assert len(expected_data3) == 15
   file_name4 = "/unavailable_file"
   expected_data = []
   expected_data.extend(expected_data1)
   expected_data.extend(expected_data2)
   expected_data.extend(expected_data3)
   with TestPipeline() as pipeline:
     pcoll = (pipeline
              | 'Create' >> Create(
                  [file_name1, file_name2, file_name3, file_name4])
               | 'ReadAll' >> ReadAllFromText())
     assert_that(pcoll, equal_to(expected_data))
Example #10
from collections import defaultdict

# Global word-count dictionary, filled in by wordCount and read out at the end.
wc = defaultdict(int)

def wordCount(word):
    wc[word] += 1

def splitLines(line):
    text = nltk.word_tokenize(line.lower().strip("\r\n"))
    filtered_sentence = [w for w in text if w not in stop_words]
    return filtered_sentence


from sklearn.feature_extraction.text import HashingVectorizer
vectorizer = HashingVectorizer(strip_accents='unicode',
                               stop_words=stop_words)
def hashingVector(doc):
    return vectorizer.fit_transform(doc)
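hashingVector is defined but never wired into the Beam pipeline below; a small hypothetical local call, just to show what it returns (a sparse matrix with one row per document and, by default, 2**20 hashed feature columns). The corpus is made up:

docs = ["the quick brown fox", "jumps over the lazy dog"]
features = hashingVector(docs)
print(features.shape)  # (2, 1048576) with HashingVectorizer's default n_features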

(books
    | "Read Files" >> ReadAllFromText()
    | "Split Lines" >> beam.ParDo(splitLines)
    | "Clean Words" >> beam.ParDo(wordClean)
    | "Count Words" >> beam.ParDo(wordCount)
)
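This last example also leans on names defined earlier in its original source that are not shown here: stop_words, wordClean, books and pipeline. A rough sketch of what that setup could look like; every definition below is an assumption, including the behaviour of wordClean and the input path:

import apache_beam as beam
import nltk
from nltk.corpus import stopwords

nltk.download('punkt')
nltk.download('stopwords')
stop_words = stopwords.words('english')

def wordClean(word):
    # Hypothetical cleanup step: keep alphabetic tokens only.
    if word.isalpha():
        yield word.lower()

pipeline = beam.Pipeline()
# 'books' is assumed to be a PCollection of text file paths to read.
books = pipeline | 'File Paths' >> beam.Create(['gs://my-bucket/books/book1.txt'])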


result = pipeline.run()
result.wait_until_finish()

import pandas as pd
df = pd.DataFrame({"word": list(wc.keys()), "count": list(wc.values())})

#df.to_json("./test.json" , orient="records")
#df.sort_values("y" , ascending=False).iloc[:500].to_json("./test.json" , orient="records")
df.to_gbq("nlp.wordcounts", "dsba6155", if_exists='append')