def setUp(self):
    # Create local job for upload
    warnings.filterwarnings("ignore", category=ResourceWarning,
                            message="unclosed.*<ssl.SSLSocket.*>")
    jobs.generate_job('test', 'test', dry_run=False)  # Tests generate_job to some extent
    self.jobs = jobs.get_jobs(job_type='test')
def tvf_business(filename):
    print('Beginning business data transformation.')
    data = load_data(filename)
    savepaths = save_chunks(data=data,
                            max_size=30000,
                            prefix='clean_transformed',
                            rootname='business',
                            path='Processed/',
                            filetype='json')
    for path in savepaths:
        generate_job(objectpath=path, job_type='POST')
def tvf_review(filename):
    print('Beginning review data transformation.')
    data = load_data(filename)
    savepaths = save_chunks(
        data=data,
        max_size=50000,
        prefix='clean',
        rootname='review',
        path='Clean/',
    )
    for path in savepaths:
        generate_job(objectpath=path, job_type='NLP')
def tvf_photo(filename):
    print('Beginning photo data transformation.')
    data = load_data(filename)
    savepaths = save_chunks(
        data=data,
        max_size=100000,
        prefix='clean',
        rootname='photo',
        path='Clean/',
    )
    for path in savepaths:
        generate_job(objectpath=path, job_type='POST')
def tvf_tips(filename):
    print('Beginning tip data transformation.')
    data = load_data(filename)
    data['tip_id'] = data.apply(generate_id, axis=1)
    savepaths = save_chunks(
        data=data,
        max_size=100000,
        prefix='clean',
        rootname='tip',
        path='Clean/',
    )
    for path in savepaths:
        generate_job(objectpath=path, job_type='NLP')
def tvf_viz2(filename):
    print('Beginning Visualization 2 transformation.')
    data = load_data(filename)
    savepaths = save_chunks(data=data,
                            max_size=20000,
                            prefix='processed',
                            rootname='viz2',
                            path='Processed/',
                            filetype='json')
    for path in savepaths:
        generate_job(objectpath=path, tablename='viz2', job_type='POST', dry_run=False)
def tvf_checkin(filename):
    print('Beginning checkin data transformation.')
    data = load_data(filename)
    data['checkin_id'] = data.apply(generate_id, axis=1)
    data = data.rename(columns={'date': 'dates'})
    savepaths = save_chunks(
        data=data,
        max_size=20000,
        prefix='clean',
        rootname='checkin',
        path='Clean/',
    )
    for path in savepaths:
        generate_job(objectpath=path, job_type='POST')
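# tvf_tips and tvf_checkin above build their ID columns with data.apply(generate_id, axis=1).
# A minimal sketch of such a row-wise helper, assuming it hashes the row's values into a stable
# hex digest; the project's actual generate_id may use different fields or a different hash.
import hashlib

import pandas as pd


def generate_id(row: pd.Series) -> str:
    # Join the row's values and hash them so identical rows always get the same ID.
    raw = '|'.join(str(value) for value in row.values)
    return hashlib.md5(raw.encode('utf-8')).hexdigest()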
def test_generate_job(self):
    # Check that save paths and job data match inputs (dry-run test)
    # generate_job returns (temp_job_path, job_name, job_data)
    objectpath = 'test_object_path'
    job_type = 'TEST'
    tablename = 'test_table'
    job_name = ''.join(
        [job_type, '_', objectpath.split('/')[-1], '_job.json'])
    temp_job_path = '/tmp/' + job_name
    job_data = {
        'file': objectpath,
        'tablename': tablename,
        'test_kwarg': "test_data_value",
    }
    job_return = jobs.generate_job(
        objectpath='test_object_path',
        job_type='TEST',
        tablename='test_table',
        test_kwarg="test_data_value",
    )
    self.assertListEqual(list((temp_job_path, job_name, job_data)),
                         list(job_return))
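# For context, a minimal generate_job sketch consistent with the expectations encoded in
# test_generate_job above (the naming convention, the /tmp/ staging path, and the job_data
# contents). The dry_run handling and local write here are assumptions, not the project's
# actual implementation, which presumably also uploads the job file to S3.
import json


def generate_job(objectpath, job_type, tablename=None, dry_run=True, **kwargs):
    # '<JOB_TYPE>_<object basename>_job.json', staged under /tmp/ (from the test above).
    job_name = ''.join([job_type, '_', objectpath.split('/')[-1], '_job.json'])
    temp_job_path = '/tmp/' + job_name
    job_data = {'file': objectpath, 'tablename': tablename, **kwargs}
    if not dry_run:
        # Assumed behavior: persist the job description locally before any upload step.
        with open(temp_job_path, 'w') as job_file:
            json.dump(job_data, job_file)
    return temp_job_path, job_name, job_data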
num_jobs = len(get_jobs(job_type='retoken'))
for i in range(num_jobs):
    current_job = pop_current_job()
    asset = read_job(current_job)['File']
    main_logger.info('Running job {}. Read file {}'.format(current_job, asset))
    # Load the data
    datapath = download_data(asset)
    data = load_data(datapath)
    # Run NLP Process
    start = time.time()
    output = filter_tokens(data)
    output = output.filter(['review_id', 'tip_id', 'token', 'lemma'])
    stop = time.time()
    main_logger.info("{} processed in {}".format(len(data), stop - start))
    # Write Data to s3
    savepath = 'Processed/' + asset.split('/')[-1].split('.')[0] + '_retoken'
    write_data(data=output, savepath=savepath, dry_run=False)
    # Generate POST Job
    generate_job(savepath, 'POST')
    # Cleanup
    delete_local_file(datapath)
    delete_s3_file(current_job)
    main_logger.info("Deleted Job: {}".format(current_job))
    break  # Test break (1 at a time)
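# filter_tokens above is only referenced, not shown; its output is then narrowed to
# ['review_id', 'tip_id', 'token', 'lemma']. A rough sketch of what such a retokenizer could
# look like, assuming spaCy lemmatization over a 'text' column with stopwords and punctuation
# dropped; the column names and model choice are illustrative assumptions, not project code.
import pandas as pd
import spacy

nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])


def filter_tokens(data: pd.DataFrame, text_col: str = 'text') -> pd.DataFrame:
    # Emit one row per kept token, carrying through any id columns present in the input row.
    id_cols = [col for col in ('review_id', 'tip_id') if col in data.columns]
    rows = []
    for idx, doc in zip(data.index, nlp.pipe(data[text_col].astype(str))):
        ids = data.loc[idx, id_cols].to_dict() if id_cols else {}
        for tok in doc:
            if not (tok.is_stop or tok.is_punct or tok.is_space):
                rows.append({**ids, 'token': tok.text, 'lemma': tok.lemma_})
    return pd.DataFrame(rows)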
main_logger = logging.getLogger(__name__ + " Sentiment Adder")

num_jobs = len(get_jobs('sentiment'))  # No module creates sentiment jobs. Manually create these.
for i in range(num_jobs):
    # Get a job and read out the datapath
    current_job = pop_current_job()
    asset = read_job(current_job).get('file')
    main_logger.info('Running job {}. Read file {}'.format(current_job, asset))
    # Load the data
    datapath = download_data(asset)
    data = load_data(datapath)
    sentiment_df = add_sentiment(data)
    # Write Data to s3
    savepath = asset + '_sentiment'
    write_data(data=sentiment_df, savepath=savepath, dry_run=False)
    # Generate POST Job
    review = asset.split('_')[1] == 'review'
    if review:
        generate_job(savepath, 'POST', tablename='review_sentiment', dry_run=False)
    else:
        generate_job(savepath, 'POST', tablename='tip_sentiment', dry_run=False)
    # Cleanup
    delete_local_file(datapath)
    delete_s3_file(current_job)
    main_logger.info("Deleted Job: {}".format(current_job))
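# add_sentiment above is referenced but not defined in this excerpt. A minimal sketch,
# assuming NLTK's VADER applied to a 'text' column (requires nltk.download('vader_lexicon'));
# the project's real implementation and column names may differ.
import pandas as pd
from nltk.sentiment.vader import SentimentIntensityAnalyzer


def add_sentiment(data: pd.DataFrame, text_col: str = 'text') -> pd.DataFrame:
    # Attach VADER's compound polarity score (-1 to 1) for each row's text.
    sia = SentimentIntensityAnalyzer()
    scores = data[text_col].astype(str).map(
        lambda text: sia.polarity_scores(text)['compound'])
    return data.assign(sentiment=scores)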