def schemaMapping(self, fields):
    schema = {}
    for field in fields:
        if field['type'] == 'bool':
            schema[boa.constrict(field['id'])] = 'bool'
        else:
            schema[boa.constrict(field['id'])] = 'varchar'
    schema['id'] = 'int'
    return schema

def schemaMapping(self, fields):
    schema = {}
    for field in fields:
        if type(fields[field]) == int:
            schema[boa.constrict(field)] = 'INTEGER'
        elif type(fields[field]) == str:
            schema[boa.constrict(field)] = 'VARCHAR'
        elif type(fields[field]) == float:
            schema[boa.constrict(field)] = 'FLOAT'
    print(schema)
    return schema

def execute(self, context):
    response = self.get_data()
    response.columns = response.columns.map(boa.constrict)
    json_data = json.loads(response.to_json(orient='records'))
    schema_map = self.schemaMapping(json_data[0])
    s3 = S3Hook(s3_conn_id=self.s3_conn_id)
    if self.s3_key.endswith('.json'):
        split = path.splitext(self.s3_key)
        schema_key = '{0}_schema{1}'.format(split[0], split[1])
    results = [
        dict([boa.constrict(k), v] for k, v in i.items())
        for i in json_data
    ]
    results = '\n'.join([json.dumps(i) for i in results])
    s3.load_string(string_data=str(schema_map),
                   bucket_name=self.s3_bucket,
                   key=schema_key,
                   replace=True)
    s3.load_string(string_data=results,
                   bucket_name=self.s3_bucket,
                   key=self.s3_key,
                   replace=True)
    s3.connection.close()

def outputManager(self, context, output, key, bucket):
    if output is None or len(output) == 0:
        if self.total_output_files == 0:
            logging.info("No records pulled from Hubspot.")
            downstream_tasks = context['task'].get_flat_relatives(upstream=False)
            logging.info('Skipping downstream tasks...')
            logging.debug("Downstream task_ids %s", downstream_tasks)
            if downstream_tasks:
                self.skip(context['dag_run'],
                          context['ti'].execution_date,
                          downstream_tasks)
    else:
        logging.info('Logging {0} to GCS...'.format(key))
        output = [flatten(e) for e in output]
        output = '\n'.join([json.dumps({boa.constrict(k): v
                                        for k, v in i.items()})
                            for i in output])
        gcs = GoogleCloudStorageHook(self.gcs_conn_id)
        with open("__temp__", "w") as fid:
            fid.write(output)
        gcs.upload(self.gcs_bucket, self.gcs_object, "__temp__")
        self.total_output_files += 1

def outputManager(self, context, output, key, bucket):
    if output is None or len(output) == 0:
        if self.total_output_files == 0:
            logging.info("No records pulled from Hubspot.")
            downstream_tasks = context['task'].get_flat_relatives(upstream=False)
            logging.info('Skipping downstream tasks...')
            logging.debug("Downstream task_ids %s", downstream_tasks)
            if downstream_tasks:
                self.skip(context['dag_run'],
                          context['ti'].execution_date,
                          downstream_tasks)
    else:
        logging.info('Logging {0} to S3...'.format(key))
        output = [flatten(e) for e in output]
        output = '\n'.join([json.dumps({boa.constrict(k): v
                                        for k, v in i.items()})
                            for i in output])
        s3 = S3Hook(self.s3_conn_id)
        s3.load_string(
            string_data=str(output),
            key=key,
            bucket_name=bucket,
            replace=True
        )
        s3.connection.close()
        self.total_output_files += 1

def download_attachments(self, search_criteria, output_dir, mailbox='INBOX'):
    downloaded_files = []
    self.server.select(mailbox)
    search_result, emails = self.server.search(None, search_criteria)
    for mid in emails[0].split():
        fetch_result, item = self.server.fetch(mid, "(BODY.PEEK[])")
        email_body = item[0][1]
        message = email.message_from_bytes(email_body)
        date = datetime.strptime(
            message['date'][:-6],
            "%a, %d %b %Y %H:%M:%S %z").strftime('%Y_%m_%d_%H_%M_%S')
        if message.get_content_maintype() != 'multipart':
            self.server.store(mid, '+FLAGS', '\\Seen')
            continue
        for part in message.walk():
            if (part.get_content_maintype() != 'multipart' and
                    part.get('Content-Disposition') is not None):
                file_name = boa.constrict(part.get_filename().rsplit('.', 1)[0])
                ext = part.get_filename().rsplit('.', 1)[1]
                file_path = '{dir}/{date}_{filename}.{ext}'.format(
                    dir=output_dir, date=date, filename=file_name, ext=ext)
                with open(file_path, 'wb') as attachment_file:
                    attachment_file.write(part.get_payload(decode=True))
                downloaded_files.append(file_path)
        self.server.store(mid, '+FLAGS', '\\Seen')
    return downloaded_files

def outputManager(self, hook, output, key, bucket):
    """
    This method handles the output of the data.
    """
    if self.total_output_files == 0:
        logging.info("No records pulled.")
        if self.skip_if_null:
            downstream_tasks = self.context['task'].get_flat_relatives(
                upstream=False)
            logging.info('Skipping downstream tasks...')
            logging.debug("Downstream task_ids %s", downstream_tasks)
            if downstream_tasks:
                self.skip(self.context['dag_run'],
                          self.context['ti'].execution_date,
                          downstream_tasks)
    else:
        logging.info('Logging {0} to ...'.format(key))
        output = [flatten(e) for e in output]
        output = '\n'.join([
            json.dumps({boa.constrict(k): v for k, v in i.items()})
            for i in output
        ])
        if self.cs_type == 's3':
            hook.load_string(string_data=str(output),
                             key=key,
                             bucket_name=bucket,
                             replace=True)
            hook.connection.close()
        self.total_output_files += 1

def output_manager(self, output):
    def flatten(record, parent_key='', sep='_'):
        flattened_record = []
        for k, v in record.items():
            new_key = parent_key + sep + k if parent_key else k
            if isinstance(v, dict):
                flattened_record.extend(flatten(v, new_key, sep=sep).items())
            else:
                flattened_record.append((new_key, v))
        return dict(flattened_record)

    output = '\n'.join([json.dumps({boa.constrict(k): v
                                    for k, v in flatten(record).items()})
                        for record in output])
    s3 = S3Hook(self.s3_conn_id)
    s3.load_string(
        string_data=output,
        key=self.s3_key,
        bucket_name=self.s3_bucket,
        replace=True
    )

def output_manager(self, s3, output_name, output_data, context,
                   sheet_name, schema_name=None):
    self.s3_bucket = BaseHook.get_connection(self.s3_conn_id).host
    if self.output_format == 'json':
        output = '\n'.join([json.dumps({boa.constrict(str(k)): v
                                        for k, v in record.items()})
                            for record in output_data])
        enc_output = str.encode(output, 'utf-8')
        # If the file exceeds the compression bound, apply gzip compression.
        if len(enc_output) / 1024 / 1024 >= self.compression_bound:
            logging.info("File is more than {}MB, gzip compression will be "
                         "applied".format(self.compression_bound))
            output = gzip.compress(enc_output, compresslevel=5)
            self.xcom_push(context,
                           key='is_compressed_{}'.format(sheet_name),
                           value="compressed")
            self.load_bytes(s3,
                            bytes_data=output,
                            key=output_name,
                            bucket_name=self.s3_bucket,
                            replace=True)
        else:
            logging.info("File is less than {}MB, compression will not be "
                         "applied".format(self.compression_bound))
            self.xcom_push(context,
                           key='is_compressed_{}'.format(sheet_name),
                           value="non-compressed")
            s3.load_string(
                string_data=output,
                key=output_name,
                bucket_name=self.s3_bucket,
                replace=True
            )
    if self.include_schema is True:
        output_keys = output_data[0].keys()
        schema = [{'name': boa.constrict(a), 'type': 'varchar(512)'}
                  for a in output_keys if a is not None]
        schema = {'columns': schema}
        s3.load_string(
            string_data=json.dumps(schema),
            key=schema_name,
            bucket_name=self.s3_bucket,
            replace=True
        )
    logging.info('Successfully output "{}" to S3.'.format(output_name))

def paginate_data(self, endpoint=None, payload=None):
    if not endpoint:
        endpoint = self.endpoint

    def make_request(http_conn_id, endpoint, payload=None, token=None):
        return (MarketoHook(http_conn_id=http_conn_id)
                .run(endpoint, payload, token=token)
                .json())

    final_payload = {}
    for param in self.payload:
        final_payload[param] = self.payload[param]
    if payload:
        for param in payload:
            final_payload[param] = payload[param]

    response = make_request(self.marketo_conn_id,
                            self.methodMapper(endpoint),
                            final_payload,
                            self.token)

    if endpoint == 'paging_token':
        return response['nextPageToken']
    else:
        output = response['result']
        if 'moreResult' in list(response.keys()):
            final_payload['moreResult'] = response['moreResult']
        else:
            final_payload['moreResult'] = False

        while final_payload['moreResult']:
            response = make_request(self.marketo_conn_id,
                                    self.methodMapper(endpoint),
                                    final_payload,
                                    self.token)
            if 'result' in response.keys():
                output += response['result']
                if 'moreResult' in list(response.keys()):
                    final_payload['moreResult'] = response['moreResult']
                    final_payload['nextPageToken'] = response['nextPageToken']
                else:
                    final_payload['moreResult'] = False
            else:
                final_payload['moreResult'] = False

        output = [{boa.constrict(k): v for k, v in i.items()} for i in output]
        return output

def create_dag(workflow, schedule_interval=None, dag_cls=None, dag_type=None):
    """
    Creates a DAG instance from a workflow-like dict.

    Workflow objects are expected to have a name, schedule, and activityList.

    :param workflow: The dict describing the DAG to build
    :type workflow: dict
    :param schedule_interval: A fallback schedule if a workflow does not
        define its own
    :type schedule_interval: string
    :param dag_type: describes the type of DAG being built
    :type dag_type: string
    :return: DAG
    """
    if not dag_cls:
        raise Exception('must pass DAG class to create_dag')

    # Override default_args with workflow.default_args (e.g. start_date).
    workflow_args = workflow.get('default_args')
    if isinstance(workflow_args, dict):
        args = {**default_args, **workflow_args}
    else:
        args = default_args

    id_ = workflow.get('_id')
    workflow_name = boa.constrict(workflow.get('name', '').lower())
    schedule = workflow.get('schedule', schedule_interval)

    if dag_type is not None:
        dag_name = '{workflow_name}__{dag_type}__{id_}'.format(
            workflow_name=workflow_name, dag_type=dag_type, id_=id_)
    else:
        dag_name = '{workflow_name}__{id_}'.format(
            workflow_name=workflow_name, id_=id_)

    print('Building DAG: {name}'.format(name=dag_name))
    dag = dag_cls(dag_name, default_args=args, schedule_interval=schedule)
    create_tasks(dag, workflow)
    return dag

def execute(self, context):
    imap_conn = ImapHook(self.imap_conn_id)
    s3_conn = S3Hook(self.s3_conn_id)

    tmp_dir = '/tmp/{key}'.format(key=self.s3_key)
    if os.path.exists(tmp_dir):
        shutil.rmtree(tmp_dir)
    os.mkdir(tmp_dir)

    criteria = '(FROM "{imap_email}" SUBJECT "{imap_subject}" UNSEEN)'.format(
        imap_email=self.imap_email, imap_subject=self.imap_subject)
    attachments = imap_conn.download_attachments(criteria, tmp_dir)

    file_name = '{tmp_dir}/{key}.jsonl'.format(tmp_dir=tmp_dir, key=self.s3_key)
    with open(file_name, 'w') as s3_upload_file:
        for attachment in attachments:
            with open(attachment, 'r', errors='replace') as f:
                reader = csv.reader(f)
                headers = [boa.constrict(header) for header in next(reader)]
                for row in reader:
                    json_line = {}
                    for index, col in enumerate(row):
                        json_line[headers[index]] = col
                    json.dump(json_line, s3_upload_file)
                    s3_upload_file.write('\n')

    s3_conn.load_file(file_name, self.s3_key, self.s3_bucket, True)
    shutil.rmtree(tmp_dir)

def getSalesforceRecords(name, **kwargs):
    sf = get_salesforce_conn()
    formatted_name = "{}.json".format(name.lower())
    templates_dict = kwargs.get('templates_dict', {})
    fields = json.loads(templates_dict.get('fields', '[]'))
    query_string = "SELECT {0} FROM {1}".format(','.join(fields), name)
    print(query_string)
    response = sf.query_all(query_string)
    output = response['records']
    output = '\n'.join([json.dumps(flatten({boa.constrict(k): v
                                            for k, v in i.items()}))
                        for i in output])
    with NamedTemporaryFile("w") as f:
        f.write(output)
        # Flush buffered writes so the file is complete before uploading by name.
        f.flush()
        s3_key = 'salesforce/{}'.format(formatted_name)
        s3 = S3Hook(s3_conn_id='INSERT_S3_CONN_ID_HERE')
        s3.load_file(
            filename=f.name,
            key=s3_key,
            bucket_name='INSERT_S3_BUCKET_NAME_HERE',
            replace=True
        )
        s3.connection.close()
    return s3_key

def paginate_data(self, h, endpoint, context, company_id=None, campaign_id=None):
    """
    This method takes care of request building and pagination. It retrieves
    100 at a time and continues to make subsequent requests until it
    retrieves less than 100 records.
    """
    output = []
    try:
        initial_offset = Variable.get(
            'INCREMENTAL_KEY__{0}_{1}_vidOffset'.format(context['ti'].dag_id,
                                                        context['ti'].task_id))
        print('INITIAL OFFSET: ' + str(initial_offset))
    except:
        initial_offset = 0
    final_payload = {'vidOffset': initial_offset}
    if self.hubspot_object in ('events', 'timeline'):
        final_payload['limit'] = 1000
    elif self.hubspot_object == 'deals':
        final_payload['limit'] = 250
    elif self.hubspot_object == 'contacts':
        final_payload['count'] = 100
    for param in self.hubspot_args:
        # If time is used as a filter in the request and is a string object
        # (e.g. when using {{ execution_date }}), convert the timestamp
        # to Hubspot formatting as needed by the Hubspot API.
        if param in ('startTimestamp', 'endTimestamp'):
            param_time = datetime.datetime.strptime(self.hubspot_args[param],
                                                    "%Y-%m-%d %H:%M:%S")
            self.hubspot_args[param] = int(time.mktime(param_time.timetuple()) * 1000)
        final_payload[param] = self.hubspot_args[param]
    logging.info('FINAL PAYLOAD: ' + str(final_payload))
    response = h.run(endpoint, final_payload).json()
    if not response:
        logging.info('Resource Unavailable.')
        return ''
    if self.hubspot_object == 'owners':
        output.extend([e for e in response])
        # output = [self.filterMapper(record) for record in output]
        output = self.subTableMapper(output)
        return output
    elif self.hubspot_object == 'engagements':
        output.extend([e for e in response['results']])
    elif self.hubspot_object == 'contacts_by_company':
        if endpoint == 'companies/v2/companies/paged':
            if response['companies']:
                output.extend([e for e in response['companies']])
            else:
                logging.info('No companies currently available.')
                return ''
        else:
            output.extend([{"vid": e, "company_id": company_id}
                           for e in response['vids']])
    elif self.hubspot_object == 'campaigns':
        if 'email/public/v1/campaigns/' in endpoint:
            output.append(response)
    elif self.hubspot_object in ('deal_pipelines', 'social'):
        output.extend([e for e in response])
    else:
        output.extend([e for e in response[self.hubspot_object]])
    if isinstance(response, dict):
        if 'hasMore' in list(response.keys()):
            more = 'hasMore'
        elif 'has-more' in list(response.keys()):
            more = 'has-more'
        else:
            more = 'has-more'
            response['has-more'] = False
        n = 0
        if 'vid-offset' in list(response.keys()):
            offset_variable = 'vid-offset'
        elif 'offset' in list(response.keys()):
            offset_variable = 'offset'
        while response[more] is True:
            if offset_variable == 'vid-offset':
                final_payload['vidOffset'] = response['vid-offset']
                logging.info('Retrieving: ' + str(response['vid-offset']))
            elif offset_variable == 'offset':
                final_payload['offset'] = response['offset']
                logging.info('Retrieving: ' + str(response['offset']))
            try:
                response = h.run(endpoint, final_payload).json()
            except:
                pass
            if endpoint == 'companies/v2/companies/paged':
                if response['companies']:
                    output.extend([e for e in response['companies']])
            else:
                output.extend([e for e in response[self.hubspot_object]])
            n += 1
            time.sleep(0.2)
            if n % 50 == 0:
                # output = [self.filterMapper(record) for record in output]
                output = self.subTableMapper(output)
                if self.hubspot_object == 'contacts_by_company':
                    companies = self.retrieve_data(h, self.methodMapper('companies'))
                    if not companies:
                        logging.info('No companies currently available.')
                        downstream_tasks = context['task'].get_flat_relatives(upstream=False)
                        logging.info('Skipping downstream tasks...')
                        logging.debug("Downstream task_ids %s", downstream_tasks)
                        if downstream_tasks:
                            self.skip(context['dag_run'],
                                      context['ti'].execution_date,
                                      downstream_tasks)
                        return True
                    final_output = []
                    for company in companies:
                        final_output.extend(output)
                    key = '{0}_core_{1}{2}'.format(self.split[0], str(n), self.split[1])
                    self.outputManager(context, output, key, self.s3_bucket)
                else:
                    for e in output:
                        for k, v in e.items():
                            if k == 'core':
                                key = '{0}_core_{1}{2}'.format(self.split[0],
                                                               str(n),
                                                               self.split[1])
                            else:
                                key = '{0}_{1}_{2}{3}'.format(self.split[0],
                                                              boa.constrict(k),
                                                              str(n),
                                                              self.split[1])
                            logging.info('Sending to Output Manager...')
                            self.outputManager(context, v, key, self.s3_bucket)
                if self.hubspot_object == 'contacts':
                    if response[offset_variable] == 0:
                        logging.info('No new records received.')
                        logging.info('Offset variable is still: ' + str(initial_offset))
                    else:
                        new_offset = ('INCREMENTAL_KEY__{0}_{1}_vidOffset'
                                      .format(context['ti'].dag_id,
                                              context['ti'].task_id))
                        logging.info('New Variable offset is now: ' +
                                     str(response[offset_variable]))
                        Variable.set(new_offset, response[offset_variable])
                output = []
        if self.hubspot_object == 'contacts':
            if response[offset_variable] == 0:
                logging.info('No new records received.')
                logging.info('Offset variable is still: ' + str(initial_offset))
            else:
                new_offset = ('INCREMENTAL_KEY__{0}_{1}_vidOffset'
                              .format(context['ti'].dag_id,
                                      context['ti'].task_id))
                logging.info('New Variable offset is now: ' +
                             str(response[offset_variable]))
                Variable.set(new_offset, response[offset_variable])
    # output = [self.filterMapper(record) for record in output]
    output = self.subTableMapper(output)
    return output

"department": "varchar", "job_title": "varchar", "reports_to": "varchar", } elif self.method == 'getEmploymentStatus': schema = { "id": "int", "employee_id": "int", "date": "date", "employment_status": "varchar", "benetrac_status": "varchar", "gusto": "varchar", } results = [dict([boa.constrict(k), v] for k, v in i.items()) for i in results] results = '\n'.join([json.dumps(i) for i in results]) s3.load_string( string_data=json.dumps(schema), bucket_name=self.s3_bucket, key=schema_key, replace=True ) s3.load_string( string_data=results, bucket_name=self.s3_bucket, key=self.s3_key, replace=True
def execute(self, context):
    g_conn = GoogleHook(self.google_conn_id)

    if isinstance(self.sheet_names, str) and ',' in self.sheet_names:
        sheet_names = self.sheet_names.split(',')
    else:
        sheet_names = self.sheet_names

    sheets_object = g_conn.get_service_object('sheets', 'v4', [
        'https://spreadsheets.google.com/feeds',
        'https://www.googleapis.com/auth/drive'
    ])
    print('Retrieved Sheets Object')

    response = sheets_object.spreadsheets().get(
        spreadsheetId=self.sheet_id, includeGridData=True).execute()

    title = response.get('properties').get('title')
    sheets = response.get('sheets')

    final_output = dict()
    total_sheets = []
    for sheet in sheets:
        name = sheet.get('properties').get('title')
        total_sheets.append(name)
        if self.sheet_names:
            if name not in sheet_names:
                print('{} is not found in available sheet names.'.format(name))
                continue
        table_name = name
        data = sheet.get('data')[0].get('rowData')
        output = []
        for row in data:
            row_data = []
            values = row.get('values')
            for value in values:
                ev = value.get('effectiveValue')
                if ev is None:
                    row_data.append(None)
                else:
                    for v in ev.values():
                        row_data.append(v)
            output.append(row_data)
        if self.output_format == 'json':
            headers = output.pop(0)
            output = [dict(zip(headers, row)) for row in output]
        final_output[table_name] = output

    s3 = S3Hook(self.s3_conn_id)
    for sheet in final_output:
        output_data = final_output.get(sheet)
        file_name = os.path.splitext(self.s3_path)[0]
        sheet = boa.constrict(sheet)
        output_name = ''.join(
            [self.s3_path, '/', sheet, '.', self.output_format])
        if self.include_schema is True:
            schema_name = ''.join([
                self.s3_path, '/', sheet, '_schema', '.', self.output_format
            ])
        else:
            schema_name = None
        self.output_manager(s3, output_name, output_data, context, sheet,
                            schema_name)

    dag_id = context['ti'].dag_id
    var_key = '_'.join([dag_id, self.sheet_id])
    Variable.set(key=var_key, value=json.dumps(total_sheets))
    time.sleep(10)
    return boa.constrict(title)

def execute(self, context):
    self.token = (MarketoHook(http_conn_id=self.marketo_conn_id)
                  .run(self.methodMapper('auth'))
                  .json())['access_token']

    if self.endpoint == 'activities':
        paging_token = self.paginate_data(
            endpoint='paging_token',
            payload={'sinceDatetime': '2014-01-01T00:00:00'})
        activity_types = self.paginate_data(endpoint='activity_types')
        activities = [activity['id'] for activity in activity_types]
        output = []
        output += self.paginate_data(payload={'activityTypeIds': activities[0],
                                              'nextPageToken': paging_token})
    elif self.endpoint == 'leads':
        request = {}
        lead_fields = self.paginate_data(endpoint='lead_description')
        request['fields'] = [record['rest']['name'] for record in lead_fields]
        request['columnHeaderNames'] = {record['rest']['name']: record['rest']['name']
                                        for record in lead_fields}
        request['filter'] = {}
        createdAt = {}
        createdAt['startAt'] = self.start_at
        createdAt['endAt'] = self.end_at
        request['filter']['updatedAt'] = createdAt
        request['format'] = 'CSV'

        get_hook = MarketoHook(http_conn_id=self.marketo_conn_id)
        post_hook = MarketoHook(method='POST', http_conn_id=self.marketo_conn_id)

        job = post_hook.run(self.methodMapper('leads_create'),
                            data=json.dumps(request),
                            token=self.token).json()
        export_id = [e['exportId'] for e in job['result']][0]

        status = [e['status'] for e in
                  post_hook.run('bulk/v1/leads/export/{0}/enqueue.json'.format(export_id),
                                token=self.token).json()['result']][0]
        while status != 'Completed':
            status = [e['status'] for e in
                      get_hook.run('bulk/v1/leads/export/{0}/status.json'.format(export_id),
                                   token=self.token).json()['result']][0]
            logging.info('Status: ' + str(status))
            sleep(5)

        output = get_hook.run('bulk/v1/leads/export/{0}/file.json'.format(export_id),
                              token=self.token).text
        output = output.split('\n')
        headers = output.pop(0).split(',')
        del output[0]
        headers = [boa.constrict(header) for header in headers]
        output = [row for row in reader(output)]
        output = [dict(zip(headers, row)) for row in output]

        marketo_schema = schema[self.endpoint]
        field_names = []
        for field in marketo_schema['fields']:
            field_names.append(field['name'])
        logging.info('DIFF: ' + str(set(headers) - set(field_names)))
    else:
        output = self.paginate_data()

    logging.info('Output Length: ' + str(len(output)))

    if output is None or len(output) == 0:
        logging.info("No records pulled from Marketo.")
        downstream_tasks = context['task'].get_flat_relatives(upstream=False)
        logging.info('Skipping downstream tasks...')
        logging.debug("Downstream task_ids %s", downstream_tasks)
        if downstream_tasks:
            self.skip(context['dag_run'],
                      context['ti'].execution_date,
                      downstream_tasks)
        return True
    else:
        self.outputManager(self.nullify_output(output),
                           self.s3_key,
                           self.s3_bucket,
                           self.output_format)

def imap_py(**kwargs):
    selenium_conn_id = kwargs.get('templates_dict', None).get('selenium_conn_id', None)
    filename = kwargs.get('templates_dict', None).get('filename', None)
    s3_conn_id = kwargs.get('templates_dict', None).get('s3_conn_id', None)
    s3_bucket = kwargs.get('templates_dict', None).get('s3_bucket', None)
    s3_key = kwargs.get('templates_dict', None).get('s3_key', None)
    date = kwargs.get('templates_dict', None).get('date', None)

    @provide_session
    def get_conn(conn_id, session=None):
        conn = (session.query(Connection)
                .filter(Connection.conn_id == conn_id)
                .first())
        return conn

    url = get_conn(selenium_conn_id).host
    email = get_conn(selenium_conn_id).user
    pwd = get_conn(selenium_conn_id).password

    vdisplay = Xvfb()
    vdisplay.start()

    caps = webdriver.DesiredCapabilities.FIREFOX
    caps["marionette"] = True

    profile = webdriver.FirefoxProfile()
    profile.set_preference("browser.download.manager.showWhenStarting", False)
    profile.set_preference('browser.helperApps.neverAsk.saveToDisk', "text/csv")
    logging.info('Profile set...')

    options = Options()
    options.set_headless(headless=True)
    logging.info('Options set...')

    logging.info('Initializing Driver...')
    driver = webdriver.Firefox(firefox_profile=profile,
                               firefox_options=options,
                               capabilities=caps)
    logging.info('Driver Initialized...')

    driver.get(url)

    logging.info('Authenticating...')
    elem = driver.find_element_by_id("email")
    elem.send_keys(email)
    elem = driver.find_element_by_id("password")
    elem.send_keys(pwd)
    elem.send_keys(Keys.RETURN)
    logging.info('Successfully authenticated.')

    sleep_time = 15
    logging.info('Downloading File....Sleeping for {} Seconds.'.format(str(sleep_time)))
    time.sleep(sleep_time)

    driver.close()
    vdisplay.stop()

    dest_s3 = S3Hook(s3_conn_id=s3_conn_id)
    os.chdir('/root/Downloads')

    output_json = 'file.json'
    with open(filename, 'r') as csvfile, open(output_json, 'w') as jsonfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            row = dict((boa.constrict(k), v) for k, v in row.items())
            row['run_date'] = date
            json.dump(row, jsonfile)
            jsonfile.write('\n')

    dest_s3.load_file(filename=output_json,
                      key=s3_key,
                      bucket_name=s3_bucket,
                      replace=True)
    dest_s3.connection.close()