def setUp(self):
    # Create local job for upload
    warnings.filterwarnings("ignore",
                            category=ResourceWarning,
                            message="unclosed.*<ssl.SSLSocket.*>")
    jobs.generate_job('test', 'test',
                      dry_run=False)  # Tests generate_job to some extent
    self.jobs = jobs.get_jobs(job_type='test')
def tvf_business(filename):
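    # Chunk the cleaned business data into JSON files under Processed/ and queue a POST job for each saved chunk.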
    print('Beginning business data transformation.')
    data = load_data(filename)
    savepaths = save_chunks(data=data,
                            max_size=30000,
                            prefix='clean_transformed',
                            rootname='business',
                            path='Processed/',
                            filetype='json')
    for path in savepaths:
        generate_job(objectpath=path, job_type='POST')
def tvf_review(filename):
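    # Chunk the review data under Clean/ and queue an NLP job for each saved chunk.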
    print('Beginning review data transformation.')
    data = load_data(filename)
    savepaths = save_chunks(
        data=data,
        max_size=50000,
        prefix='clean',
        rootname='review',
        path='Clean/',
    )
    for path in savepaths:
        generate_job(objectpath=path, job_type='NLP')
def tvf_photo(filename):
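    # Chunk the photo data under Clean/ and queue a POST job for each saved chunk.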
    print('Beginning photo data transformation.')
    data = load_data(filename)
    savepaths = save_chunks(
        data=data,
        max_size=100000,
        prefix='clean',
        rootname='photo',
        path='Clean/',
    )
    for path in savepaths:
        generate_job(objectpath=path, job_type='POST')
def tvf_tips(filename):
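    # Add a generated tip_id to each row, chunk the data under Clean/, and queue an NLP job per chunk.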
    print('Beginning tip data transformation.')
    data = load_data(filename)
    data['tip_id'] = data.apply(generate_id, axis=1)
    savepaths = save_chunks(
        data=data,
        max_size=100000,
        prefix='clean',
        rootname='tip',
        path='Clean/',
    )
    for path in savepaths:
        generate_job(objectpath=path, job_type='NLP')
def tvf_viz2(filename):
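    # Chunk the visualization 2 data as JSON under Processed/ and queue a POST job into the viz2 table per chunk.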
    print('Beginning Visualization 2 transformation.')
    data = load_data(filename)
    savepaths = save_chunks(data=data,
                            max_size=20000,
                            prefix='processed',
                            rootname='viz2',
                            path='Processed/',
                            filetype='json')
    for path in savepaths:
        generate_job(objectpath=path,
                     tablename='viz2',
                     job_type='POST',
                     dry_run=False)
def tvf_checkin(filename):
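    # Add a generated checkin_id, rename the date column to dates, chunk under Clean/, and queue POST jobs.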
    print('Beginning checkin data transformation.')
    data = load_data(filename)
    data['checkin_id'] = data.apply(generate_id, axis=1)
    data = data.rename(columns={'date': 'dates'})
    savepaths = save_chunks(
        data=data,
        max_size=20000,
        prefix='clean',
        rootname='checkin',
        path='Clean/',
    )
    for path in savepaths:
        generate_job(objectpath=path, job_type='POST')
    def test_generate_job(self):
        # Check that save paths and job data match inputs (dry-run test)
        # generate_job returns (temp_job_path, job_name, job_data)
        objectpath = 'test_object_path'
        job_type = 'TEST'
        tablename = 'test_table'
        job_name = ''.join(
            [job_type, '_',
             objectpath.split('/')[-1], '_job.json'])
        temp_job_path = '/tmp/' + job_name

        job_data = {
            'file': objectpath,
            'tablename': tablename,
            'test_kwarg': "test_data_value",
        }

        job_return = jobs.generate_job(
            objectpath='test_object_path',
            job_type='TEST',
            tablename='test_table',
            test_kwarg="test_data_value",
        )

        self.assertListEqual(list((temp_job_path, job_name, job_data)),
                             list(job_return))
Example #9
    num_jobs = len(get_jobs(job_type='retoken'))

    for i in range(num_jobs):
        # Get a job and read out the datapath
        current_job = pop_current_job()
        asset = read_job(current_job).get('file')

        main_logger.info('Running job {}.  Read file {}'.format(current_job, asset))

        # Load the data
        datapath = download_data(asset)
        data = load_data(datapath)

        # Run NLP Process
        start = time.time()

        output = filter_tokens(data)
        output = output.filter(['review_id', 'tip_id', 'token', 'lemma'])

        stop = time.time()
        main_logger.info("{} processed in {}".format(len(data), stop-start))

        # Write Data to s3
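        # The output key keeps the input filename (minus extension) and appends '_retoken', e.g. Clean/foo.json -> Processed/foo_retoken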
        savepath = 'Processed/' + asset.split('/')[-1].split('.')[0] + '_retoken'
        write_data(data=output, savepath=savepath, dry_run=False)

        # Generate POST Job
        generate_job(savepath, 'POST')

        # Cleanup
        delete_local_file(datapath)
        delete_s3_file(current_job)
        main_logger.info("Deleted Job: {}".format(current_job))
        break # Test break (1 at a time)
    main_logger = logging.getLogger(__name__ + " Sentiment Adder")

    num_jobs = len(get_jobs('sentiment'))  # No module creates sentiment jobs; these must be created manually.

    for i in range(num_jobs):
        # Get a job and read out the datapath
        current_job = pop_current_job()
        asset = read_job(current_job).get('file')

        main_logger.info('Running job {}.  Read file {}'.format(current_job, asset))

        # Load the data
        datapath = download_data(asset)
        data = load_data(datapath)
        sentiment_df = add_sentiment(data)

        # Write Data to s3
        savepath = asset + '_sentiment'
        write_data(data=sentiment_df, savepath=savepath, dry_run=False)
        
        # Generate POST Job
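        # Chunk names follow '<prefix>_<rootname>_...', so the second underscore-separated token tells review data from tip data.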
        review = asset.split('_')[1] == 'review'
        if review:
            generate_job(savepath, 'POST', tablename='review_sentiment', dry_run=False)
        else:
            generate_job(savepath, 'POST', tablename='tip_sentiment', dry_run=False)

        # Cleanup
        delete_local_file(datapath)
        delete_s3_file(current_job)
        main_logger.info("Deleted Job: {}".format(current_job))